packages/kernel.git: xen-3.0.4-2.6.19.patch
1 diff -ruNp linux-2.6.19/arch/i386/Kconfig linux-2.6.19-xen-3.0.4/arch/i386/Kconfig
2 --- linux-2.6.19/arch/i386/Kconfig      2006-11-29 21:57:37.000000000 +0000
3 +++ linux-2.6.19-xen-3.0.4/arch/i386/Kconfig    2007-02-02 19:10:20.000000000 +0000
4 @@ -16,6 +16,7 @@ config X86_32
5  
6  config GENERIC_TIME
7         bool
8 +       depends on !X86_XEN
9         default y
10  
11  config LOCKDEP_SUPPORT
12 @@ -103,6 +104,15 @@ config X86_PC
13         help
14           Choose this option if your computer is a standard PC or compatible.
15  
16 +config X86_XEN
17 +       bool "Xen-compatible"
18 +       select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
19 +       select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
20 +       select SWIOTLB
21 +       help
22 +         Choose this option if you plan to run this kernel on top of the
23 +         Xen Hypervisor.
24 +
25  config X86_ELAN
26         bool "AMD Elan"
27         help
28 @@ -212,6 +222,7 @@ source "arch/i386/Kconfig.cpu"
29  
30  config HPET_TIMER
31         bool "HPET Timer Support"
32 +       depends on !X86_XEN
33         help
34           This enables the use of the HPET for the kernel's internal timer.
35           HPET is the next generation timer replacing legacy 8254s.
36 @@ -262,7 +273,7 @@ source "kernel/Kconfig.preempt"
37  
38  config X86_UP_APIC
39         bool "Local APIC support on uniprocessors"
40 -       depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH)
41 +       depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH || XEN_UNPRIVILEGED_GUEST)
42         help
43           A local APIC (Advanced Programmable Interrupt Controller) is an
44           integrated interrupt controller in the CPU. If you have a single-CPU
45 @@ -287,12 +298,12 @@ config X86_UP_IOAPIC
46  
47  config X86_LOCAL_APIC
48         bool
49 -       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH
50 +       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) || X86_GENERICARCH
51         default y
52  
53  config X86_IO_APIC
54         bool
55 -       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH
56 +       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) || X86_GENERICARCH
57         default y
58  
59  config X86_VISWS_APIC
60 @@ -302,7 +313,7 @@ config X86_VISWS_APIC
61  
62  config X86_MCE
63         bool "Machine Check Exception"
64 -       depends on !X86_VOYAGER
65 +       depends on !(X86_VOYAGER || X86_XEN)
66         ---help---
67           Machine Check Exception support allows the processor to notify the
68           kernel if it detects a problem (e.g. overheating, component failure).
69 @@ -401,6 +412,7 @@ config X86_REBOOTFIXUPS
70  
71  config MICROCODE
72         tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
73 +       depends on !XEN_UNPRIVILEGED_GUEST
74         select FW_LOADER
75         ---help---
76           If you say Y here and also to "/dev file system support" in the
77 @@ -424,6 +436,7 @@ config MICROCODE_OLD_INTERFACE
78  
79  config X86_MSR
80         tristate "/dev/cpu/*/msr - Model-specific register support"
81 +       depends on !X86_XEN
82         help
83           This device gives privileged processes access to the x86
84           Model-Specific Registers (MSRs).  It is a character device with
85 @@ -439,6 +452,10 @@ config X86_CPUID
86           with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
87           /dev/cpu/31/cpuid.
88  
89 +config SWIOTLB
90 +       bool
91 +       default n
92 +
93  source "drivers/firmware/Kconfig"
94  
95  choice
96 @@ -611,7 +628,7 @@ source "mm/Kconfig"
97  
98  config HIGHPTE
99         bool "Allocate 3rd-level pagetables from highmem"
100 -       depends on HIGHMEM4G || HIGHMEM64G
101 +       depends on (HIGHMEM4G || HIGHMEM64G) && !X86_XEN
102         help
103           The VM uses one page table entry for each page of physical memory.
104           For systems with a lot of RAM, this can be wasteful of precious
105 @@ -620,6 +637,7 @@ config HIGHPTE
106  
107  config MATH_EMULATION
108         bool "Math emulation"
109 +       depends on !X86_XEN
110         ---help---
111           Linux can emulate a math coprocessor (used for floating point
112           operations) if you don't have one. 486DX and Pentium processors have
113 @@ -645,6 +663,8 @@ config MATH_EMULATION
114  
115  config MTRR
116         bool "MTRR (Memory Type Range Register) support"
117 +       depends on !XEN_UNPRIVILEGED_GUEST
118 +       default y if X86_XEN
119         ---help---
120           On Intel P6 family processors (Pentium Pro, Pentium II and later)
121           the Memory Type Range Registers (MTRRs) may be used to control
122 @@ -679,7 +699,7 @@ config MTRR
123  
124  config EFI
125         bool "Boot from EFI support"
126 -       depends on ACPI
127 +       depends on ACPI && !X86_XEN
128         default n
129         ---help---
130         This enables the kernel to boot on EFI platforms using
131 @@ -697,7 +717,7 @@ config EFI
132  
133  config IRQBALANCE
134         bool "Enable kernel irq balancing"
135 -       depends on SMP && X86_IO_APIC
136 +       depends on SMP && X86_IO_APIC && !X86_XEN
137         default y
138         help
139           The default yes will allow the kernel to do irq load balancing.
140 @@ -745,6 +765,7 @@ source kernel/Kconfig.hz
141  
142  config KEXEC
143         bool "kexec system call"
144 +       depends on !XEN_UNPRIVILEGED_GUEST
145         help
146           kexec is a system call that implements the ability to shutdown your
147           current kernel, and to start another kernel.  It is like a reboot
148 @@ -804,7 +825,7 @@ config HOTPLUG_CPU
149  config COMPAT_VDSO
150         bool "Compat VDSO support"
151         default y
152 -       depends on !PARAVIRT
153 +       depends on !X86_XEN
154         help
155           Map the VDSO to the predictable old-style address too.
156         ---help---
157 @@ -821,18 +842,20 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
158         depends on HIGHMEM
159  
160  menu "Power management options (ACPI, APM)"
161 -       depends on !X86_VOYAGER
162 +       depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
163  
164 +if !X86_XEN
165  source kernel/power/Kconfig
166 +endif
167  
168  source "drivers/acpi/Kconfig"
169  
170  menu "APM (Advanced Power Management) BIOS Support"
171 -depends on PM && !X86_VISWS
172 +depends on PM && !(X86_VISWS || X86_XEN)
173  
174  config APM
175         tristate "APM (Advanced Power Management) BIOS support"
176 -       depends on PM
177 +       depends on PM && PM_LEGACY
178         ---help---
179           APM is a BIOS specification for saving power using several different
180           techniques. This is mostly useful for battery powered laptops with
181 @@ -1017,6 +1040,7 @@ choice
182  
183  config PCI_GOBIOS
184         bool "BIOS"
185 +       depends on !X86_XEN
186  
187  config PCI_GOMMCONFIG
188         bool "MMConfig"
189 @@ -1024,6 +1048,13 @@ config PCI_GOMMCONFIG
190  config PCI_GODIRECT
191         bool "Direct"
192  
193 +config PCI_GOXEN_FE
194 +       bool "Xen PCI Frontend"
195 +       depends on X86_XEN
196 +       help
197 +         The PCI device frontend driver allows the kernel to import arbitrary
198 +         PCI devices from a PCI backend to support PCI driver domains.
199 +
200  config PCI_GOANY
201         bool "Any"
202  
203 @@ -1031,7 +1062,7 @@ endchoice
204  
205  config PCI_BIOS
206         bool
207 -       depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
208 +       depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
209         default y
210  
211  config PCI_DIRECT
212 @@ -1044,6 +1075,18 @@ config PCI_MMCONFIG
213         depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
214         default y
215  
216 +config XEN_PCIDEV_FRONTEND
217 +       bool
218 +       depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY)
219 +       default y
220 +
221 +config XEN_PCIDEV_FE_DEBUG
222 +       bool "Xen PCI Frontend Debugging"
223 +       depends on XEN_PCIDEV_FRONTEND
224 +       default n
225 +       help
226 +         Enables some debug statements within the PCI Frontend.
227 +
228  source "drivers/pci/pcie/Kconfig"
229  
230  source "drivers/pci/Kconfig"
231 @@ -1054,7 +1097,7 @@ config ISA_DMA_API
232  
233  config ISA
234         bool "ISA support"
235 -       depends on !(X86_VOYAGER || X86_VISWS)
236 +       depends on !(X86_VOYAGER || X86_VISWS || X86_XEN)
237         help
238           Find out whether you have ISA slots on your motherboard.  ISA is the
239           name of a bus system, i.e. the way the CPU talks to the other stuff
240 @@ -1081,7 +1124,7 @@ config EISA
241  source "drivers/eisa/Kconfig"
242  
243  config MCA
244 -       bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
245 +       bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN)
246         default y if X86_VOYAGER
247         help
248           MicroChannel Architecture is found in some IBM PS/2 machines and
249 @@ -1157,6 +1200,8 @@ source "security/Kconfig"
250  
251  source "crypto/Kconfig"
252  
253 +source "drivers/xen/Kconfig"
254 +
255  source "lib/Kconfig"
256  
257  #
258 @@ -1182,7 +1227,7 @@ config X86_SMP
259  
260  config X86_HT
261         bool
262 -       depends on SMP && !(X86_VISWS || X86_VOYAGER)
263 +       depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
264         default y
265  
266  config X86_BIOS_REBOOT
267 @@ -1195,6 +1240,16 @@ config X86_TRAMPOLINE
268         depends on X86_SMP || (X86_VOYAGER && SMP)
269         default y
270  
271 +config X86_NO_TSS
272 +       bool
273 +       depends on X86_XEN
274 +       default y
275 +
276 +config X86_NO_IDT
277 +       bool
278 +       depends on X86_XEN
279 +       default y
280 +
281  config KTIME_SCALAR
282         bool
283         default y
284 diff -ruNp linux-2.6.19/arch/i386/Kconfig.cpu linux-2.6.19-xen-3.0.4/arch/i386/Kconfig.cpu
285 --- linux-2.6.19/arch/i386/Kconfig.cpu  2006-11-29 21:57:37.000000000 +0000
286 +++ linux-2.6.19-xen-3.0.4/arch/i386/Kconfig.cpu        2007-02-02 19:10:20.000000000 +0000
287 @@ -252,7 +252,7 @@ config X86_PPRO_FENCE
288  
289  config X86_F00F_BUG
290         bool
291 -       depends on M586MMX || M586TSC || M586 || M486 || M386
292 +       depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
293         default y
294  
295  config X86_WP_WORKS_OK
296 @@ -312,5 +312,5 @@ config X86_OOSTORE
297  
298  config X86_TSC
299         bool
300 -       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ
301 +       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ && !X86_XEN
302         default y
303 diff -ruNp linux-2.6.19/arch/i386/Kconfig.debug linux-2.6.19-xen-3.0.4/arch/i386/Kconfig.debug
304 --- linux-2.6.19/arch/i386/Kconfig.debug        2006-11-29 21:57:37.000000000 +0000
305 +++ linux-2.6.19-xen-3.0.4/arch/i386/Kconfig.debug      2007-02-02 19:10:20.000000000 +0000
306 @@ -79,6 +79,7 @@ config X86_MPPARSE
307  config DOUBLEFAULT
308         default y
309         bool "Enable doublefault exception handler" if EMBEDDED
310 +       depends on !X86_NO_TSS
311         help
312            This option allows trapping of rare doublefault exceptions that
313            would otherwise cause a system to silently reboot. Disabling this
314 diff -ruNp linux-2.6.19/arch/i386/Makefile linux-2.6.19-xen-3.0.4/arch/i386/Makefile
315 --- linux-2.6.19/arch/i386/Makefile     2006-11-29 21:57:37.000000000 +0000
316 +++ linux-2.6.19-xen-3.0.4/arch/i386/Makefile   2007-02-02 19:10:20.000000000 +0000
317 @@ -60,6 +60,11 @@ AFLAGS += $(call as-instr,.cfi_startproc
318  
319  CFLAGS += $(cflags-y)
320  
321 +cppflags-$(CONFIG_XEN) += \
322 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
323 +
324 +CPPFLAGS += $(cppflags-y)
325 +
326  # Default subarch .c files
327  mcore-y  := mach-default
328  
329 @@ -83,6 +88,10 @@ mcore-$(CONFIG_X86_BIGSMP)   := mach-defau
330  mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
331  mcore-$(CONFIG_X86_SUMMIT)  := mach-default
332  
333 +# Xen subarch support
334 +mflags-$(CONFIG_X86_XEN)       := -Iinclude/asm-i386/mach-xen
335 +mcore-$(CONFIG_X86_XEN)                := mach-xen
336 +
337  # generic subarchitecture
338  mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
339  mcore-$(CONFIG_X86_GENERICARCH) := mach-default
340 @@ -117,6 +126,19 @@ boot := arch/i386/boot
341  PHONY += zImage bzImage compressed zlilo bzlilo \
342           zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
343  
344 +ifdef CONFIG_XEN
345 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
346 +head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o
347 +boot := arch/i386/boot-xen
348 +.PHONY: vmlinuz
349 +all: vmlinuz
350 +
351 +vmlinuz: vmlinux
352 +       $(Q)$(MAKE) $(build)=$(boot) $@
353 +
354 +install:
355 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
356 +else
357  all: bzImage
358  
359  # KBUILD_IMAGE specify target image being built
360 @@ -139,6 +161,7 @@ fdimage fdimage144 fdimage288 isoimage: 
361  
362  install:
363         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
364 +endif
365  
366  archclean:
367         $(Q)$(MAKE) $(clean)=arch/i386/boot
368 @@ -157,3 +180,4 @@ endef
369  CLEAN_FILES += arch/$(ARCH)/boot/fdimage \
370                arch/$(ARCH)/boot/image.iso \
371                arch/$(ARCH)/boot/mtools.conf
372 +CLEAN_FILES += vmlinuz vmlinux-stripped
373 diff -ruNp linux-2.6.19/arch/i386/boot-xen/Makefile linux-2.6.19-xen-3.0.4/arch/i386/boot-xen/Makefile
374 --- linux-2.6.19/arch/i386/boot-xen/Makefile    1970-01-01 00:00:00.000000000 +0000
375 +++ linux-2.6.19-xen-3.0.4/arch/i386/boot-xen/Makefile  2007-02-02 19:10:20.000000000 +0000
376 @@ -0,0 +1,21 @@
377 +
378 +OBJCOPYFLAGS := -g --strip-unneeded
379 +
380 +vmlinuz: vmlinux-stripped FORCE
381 +       $(call if_changed,gzip)
382 +
383 +vmlinux-stripped: vmlinux FORCE
384 +       $(call if_changed,objcopy)
385 +
386 +INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
387 +
388 +XINSTALL_NAME ?= $(KERNELRELEASE)
389 +install:
390 +       mkdir -p $(INSTALL_ROOT)/boot
391 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
392 +       rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
393 +       install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
394 +       install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
395 +       install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
396 +       install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
397 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
398 diff -ruNp linux-2.6.19/arch/i386/kernel/Makefile linux-2.6.19-xen-3.0.4/arch/i386/kernel/Makefile
399 --- linux-2.6.19/arch/i386/kernel/Makefile      2006-11-29 21:57:37.000000000 +0000
400 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/Makefile    2007-02-02 19:10:20.000000000 +0000
401 @@ -44,6 +44,12 @@ EXTRA_AFLAGS   := -traditional
402  
403  obj-$(CONFIG_SCx200)           += scx200.o
404  
405 +ifdef CONFIG_XEN
406 +vsyscall_note := vsyscall-note-xen.o
407 +else
408 +vsyscall_note := vsyscall-note.o
409 +endif
410 +
411  # vsyscall.o contains the vsyscall DSO images as __initdata.
412  # We must build both images before we can assemble it.
413  # Note: kbuild does not track this dependency due to usage of .incbin
414 @@ -65,7 +71,7 @@ SYSCFLAGS_vsyscall-int80.so   = $(vsyscall
415  
416  $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
417  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
418 -                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
419 +                     $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE
420         $(call if_changed,syscall)
421  
422  # We also create a special relocatable object that should mirror the symbol
423 @@ -77,9 +83,20 @@ $(obj)/built-in.o: ld_flags += -R $(obj)
424  
425  SYSCFLAGS_vsyscall-syms.o = -r
426  $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
427 -                       $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
428 +                       $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE
429         $(call if_changed,syscall)
430  
431  k8-y                      += ../../x86_64/kernel/k8.o
432  stacktrace-y             += ../../x86_64/kernel/stacktrace.o
433  
434 +ifdef CONFIG_XEN
435 +include $(srctree)/scripts/Makefile.xen
436 +
437 +obj-y += fixup.o
438 +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
439 +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
440 +
441 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
442 +obj-y := $(call cherrypickxen, $(obj-y))
443 +extra-y := $(call cherrypickxen, $(extra-y))
444 +endif
445 diff -ruNp linux-2.6.19/arch/i386/kernel/acpi/Makefile linux-2.6.19-xen-3.0.4/arch/i386/kernel/acpi/Makefile
446 --- linux-2.6.19/arch/i386/kernel/acpi/Makefile 2006-11-29 21:57:37.000000000 +0000
447 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/acpi/Makefile       2007-02-02 19:10:20.000000000 +0000
448 @@ -8,3 +8,7 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),)
449  obj-y                          += cstate.o processor.o
450  endif
451  
452 +ifdef CONFIG_XEN
453 +include $(srctree)/scripts/Makefile.xen
454 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
455 +endif
456 diff -ruNp linux-2.6.19/arch/i386/kernel/acpi/boot-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/acpi/boot-xen.c
457 --- linux-2.6.19/arch/i386/kernel/acpi/boot-xen.c       1970-01-01 00:00:00.000000000 +0000
458 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/acpi/boot-xen.c     2007-02-02 19:10:20.000000000 +0000
459 @@ -0,0 +1,1332 @@
460 +/*
461 + *  boot.c - Architecture-Specific Low-Level ACPI Boot Support
462 + *
463 + *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
464 + *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
465 + *
466 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
467 + *
468 + *  This program is free software; you can redistribute it and/or modify
469 + *  it under the terms of the GNU General Public License as published by
470 + *  the Free Software Foundation; either version 2 of the License, or
471 + *  (at your option) any later version.
472 + *
473 + *  This program is distributed in the hope that it will be useful,
474 + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
475 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
476 + *  GNU General Public License for more details.
477 + *
478 + *  You should have received a copy of the GNU General Public License
479 + *  along with this program; if not, write to the Free Software
480 + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
481 + *
482 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
483 + */
484 +
485 +#include <linux/init.h>
486 +#include <linux/acpi.h>
487 +#include <linux/efi.h>
488 +#include <linux/cpumask.h>
489 +#include <linux/module.h>
490 +#include <linux/dmi.h>
491 +#include <linux/irq.h>
492 +#include <linux/bootmem.h>
493 +#include <linux/ioport.h>
494 +
495 +#include <asm/pgtable.h>
496 +#include <asm/io_apic.h>
497 +#include <asm/apic.h>
498 +#include <asm/io.h>
499 +#include <asm/mpspec.h>
500 +
501 +static int __initdata acpi_force = 0;
502 +
503 +#ifdef CONFIG_ACPI
504 +int acpi_disabled = 0;
505 +#else
506 +int acpi_disabled = 1;
507 +#endif
508 +EXPORT_SYMBOL(acpi_disabled);
509 +
510 +#ifdef CONFIG_X86_64
511 +
512 +#include <asm/proto.h>
513 +
514 +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
515 +
516 +
517 +#else                          /* X86 */
518 +
519 +#ifdef CONFIG_X86_LOCAL_APIC
520 +#include <mach_apic.h>
521 +#include <mach_mpparse.h>
522 +#endif                         /* CONFIG_X86_LOCAL_APIC */
523 +
524 +#endif                         /* X86 */
525 +
526 +#define BAD_MADT_ENTRY(entry, end) (                                       \
527 +               (!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
528 +               ((acpi_table_entry_header *)entry)->length < sizeof(*entry))
529 +
530 +#define PREFIX                 "ACPI: "
531 +
532 +int acpi_noirq;                                /* skip ACPI IRQ initialization */
533 +int acpi_pci_disabled __initdata;      /* skip ACPI PCI scan and IRQ initialization */
534 +int acpi_ht __initdata = 1;    /* enable HT */
535 +
536 +int acpi_lapic;
537 +int acpi_ioapic;
538 +int acpi_strict;
539 +EXPORT_SYMBOL(acpi_strict);
540 +
541 +acpi_interrupt_flags acpi_sci_flags __initdata;
542 +int acpi_sci_override_gsi __initdata;
543 +int acpi_skip_timer_override __initdata;
544 +int acpi_use_timer_override __initdata;
545 +
546 +#ifdef CONFIG_X86_LOCAL_APIC
547 +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
548 +#endif
549 +
550 +#ifndef __HAVE_ARCH_CMPXCHG
551 +#warning ACPI uses CMPXCHG, i486 and later hardware
552 +#endif
553 +
554 +#define MAX_MADT_ENTRIES       256
555 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
556 +    {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
557 +EXPORT_SYMBOL(x86_acpiid_to_apicid);
558 +
559 +/* --------------------------------------------------------------------------
560 +                              Boot-time Configuration
561 +   -------------------------------------------------------------------------- */
562 +
563 +/*
564 + * The default interrupt routing model is PIC (8259).  This gets
565 + * overriden if IOAPICs are enumerated (below).
566 + */
567 +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
568 +
569 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
570 +
571 +/* rely on all ACPI tables being in the direct mapping */
572 +char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
573 +{
574 +       if (!phys_addr || !size)
575 +               return NULL;
576 +
577 +       if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
578 +               return __va(phys_addr);
579 +
580 +       return NULL;
581 +}
582 +
583 +#else
584 +
585 +/*
586 + * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
587 + * to map the target physical address. The problem is that set_fixmap()
588 + * provides a single page, and it is possible that the page is not
589 + * sufficient.
590 + * By using this area, we can map up to MAX_IO_APICS pages temporarily,
591 + * i.e. until the next __va_range() call.
592 + *
593 + * Important Safety Note:  The fixed I/O APIC page numbers are *subtracted*
594 + * from the fixed base.  That's why we start at FIX_IO_APIC_BASE_END and
595 + * count idx down while incrementing the phys address.
596 + */
597 +char *__acpi_map_table(unsigned long phys, unsigned long size)
598 +{
599 +       unsigned long base, offset, mapped_size;
600 +       int idx;
601 +
602 +#ifndef CONFIG_XEN
603 +       if (phys + size < 8 * 1024 * 1024)
604 +               return __va(phys);
605 +#endif
606 +
607 +       offset = phys & (PAGE_SIZE - 1);
608 +       mapped_size = PAGE_SIZE - offset;
609 +       set_fixmap(FIX_ACPI_END, phys);
610 +       base = fix_to_virt(FIX_ACPI_END);
611 +
612 +       /*
613 +        * Most cases can be covered by the below.
614 +        */
615 +       idx = FIX_ACPI_END;
616 +       while (mapped_size < size) {
617 +               if (--idx < FIX_ACPI_BEGIN)
618 +                       return NULL;    /* cannot handle this */
619 +               phys += PAGE_SIZE;
620 +               set_fixmap(idx, phys);
621 +               mapped_size += PAGE_SIZE;
622 +       }
623 +
624 +       return ((unsigned char *)base + offset);
625 +}
626 +#endif
627 +
628 +#ifdef CONFIG_PCI_MMCONFIG
629 +/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
630 +struct acpi_table_mcfg_config *pci_mmcfg_config;
631 +int pci_mmcfg_config_num;
632 +
633 +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
634 +{
635 +       struct acpi_table_mcfg *mcfg;
636 +       unsigned long i;
637 +       int config_size;
638 +
639 +       if (!phys_addr || !size)
640 +               return -EINVAL;
641 +
642 +       mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
643 +       if (!mcfg) {
644 +               printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
645 +               return -ENODEV;
646 +       }
647 +
648 +       /* how many config structures do we have */
649 +       pci_mmcfg_config_num = 0;
650 +       i = size - sizeof(struct acpi_table_mcfg);
651 +       while (i >= sizeof(struct acpi_table_mcfg_config)) {
652 +               ++pci_mmcfg_config_num;
653 +               i -= sizeof(struct acpi_table_mcfg_config);
654 +       };
655 +       if (pci_mmcfg_config_num == 0) {
656 +               printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
657 +               return -ENODEV;
658 +       }
659 +
660 +       config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
661 +       pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
662 +       if (!pci_mmcfg_config) {
663 +               printk(KERN_WARNING PREFIX
664 +                      "No memory for MCFG config tables\n");
665 +               return -ENOMEM;
666 +       }
667 +
668 +       memcpy(pci_mmcfg_config, &mcfg->config, config_size);
669 +       for (i = 0; i < pci_mmcfg_config_num; ++i) {
670 +               if (mcfg->config[i].base_reserved) {
671 +                       printk(KERN_ERR PREFIX
672 +                              "MMCONFIG not in low 4GB of memory\n");
673 +                       kfree(pci_mmcfg_config);
674 +                       pci_mmcfg_config_num = 0;
675 +                       return -ENODEV;
676 +               }
677 +       }
678 +
679 +       return 0;
680 +}
681 +#endif                         /* CONFIG_PCI_MMCONFIG */
682 +
683 +#ifdef CONFIG_X86_LOCAL_APIC
684 +static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
685 +{
686 +       struct acpi_table_madt *madt = NULL;
687 +
688 +       if (!phys_addr || !size || !cpu_has_apic)
689 +               return -EINVAL;
690 +
691 +       madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
692 +       if (!madt) {
693 +               printk(KERN_WARNING PREFIX "Unable to map MADT\n");
694 +               return -ENODEV;
695 +       }
696 +
697 +       if (madt->lapic_address) {
698 +               acpi_lapic_addr = (u64) madt->lapic_address;
699 +
700 +               printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
701 +                      madt->lapic_address);
702 +       }
703 +
704 +       acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
705 +
706 +       return 0;
707 +}
708 +
709 +static int __init
710 +acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
711 +{
712 +       struct acpi_table_lapic *processor = NULL;
713 +
714 +       processor = (struct acpi_table_lapic *)header;
715 +
716 +       if (BAD_MADT_ENTRY(processor, end))
717 +               return -EINVAL;
718 +
719 +       acpi_table_print_madt_entry(header);
720 +
721 +       /* Record local apic id only when enabled */
722 +       if (processor->flags.enabled)
723 +               x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
724 +
725 +       /*
726 +        * We need to register disabled CPU as well to permit
727 +        * counting disabled CPUs. This allows us to size
728 +        * cpus_possible_map more accurately, to permit
729 +        * to not preallocating memory for all NR_CPUS
730 +        * when we use CPU hotplug.
731 +        */
732 +       mp_register_lapic(processor->id,        /* APIC ID */
733 +                         processor->flags.enabled);    /* Enabled? */
734 +
735 +       return 0;
736 +}
737 +
738 +static int __init
739 +acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
740 +                         const unsigned long end)
741 +{
742 +       struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
743 +
744 +       lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
745 +
746 +       if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
747 +               return -EINVAL;
748 +
749 +       acpi_lapic_addr = lapic_addr_ovr->address;
750 +
751 +       return 0;
752 +}
753 +
754 +static int __init
755 +acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
756 +{
757 +       struct acpi_table_lapic_nmi *lapic_nmi = NULL;
758 +
759 +       lapic_nmi = (struct acpi_table_lapic_nmi *)header;
760 +
761 +       if (BAD_MADT_ENTRY(lapic_nmi, end))
762 +               return -EINVAL;
763 +
764 +       acpi_table_print_madt_entry(header);
765 +
766 +       if (lapic_nmi->lint != 1)
767 +               printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
768 +
769 +       return 0;
770 +}
771 +
772 +#endif                         /*CONFIG_X86_LOCAL_APIC */
773 +
774 +#ifdef CONFIG_X86_IO_APIC
775 +
776 +static int __init
777 +acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
778 +{
779 +       struct acpi_table_ioapic *ioapic = NULL;
780 +
781 +       ioapic = (struct acpi_table_ioapic *)header;
782 +
783 +       if (BAD_MADT_ENTRY(ioapic, end))
784 +               return -EINVAL;
785 +
786 +       acpi_table_print_madt_entry(header);
787 +
788 +       mp_register_ioapic(ioapic->id,
789 +                          ioapic->address, ioapic->global_irq_base);
790 +
791 +       return 0;
792 +}
793 +
794 +/*
795 + * Parse Interrupt Source Override for the ACPI SCI
796 + */
797 +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
798 +{
799 +       if (trigger == 0)       /* compatible SCI trigger is level */
800 +               trigger = 3;
801 +
802 +       if (polarity == 0)      /* compatible SCI polarity is low */
803 +               polarity = 3;
804 +
805 +       /* Command-line over-ride via acpi_sci= */
806 +       if (acpi_sci_flags.trigger)
807 +               trigger = acpi_sci_flags.trigger;
808 +
809 +       if (acpi_sci_flags.polarity)
810 +               polarity = acpi_sci_flags.polarity;
811 +
812 +       /*
813 +        * mp_config_acpi_legacy_irqs() already setup IRQs < 16
814 +        * If GSI is < 16, this will update its flags,
815 +        * else it will create a new mp_irqs[] entry.
816 +        */
817 +       mp_override_legacy_irq(gsi, polarity, trigger, gsi);
818 +
819 +       /*
820 +        * stash over-ride to indicate we've been here
821 +        * and for later update of acpi_fadt
822 +        */
823 +       acpi_sci_override_gsi = gsi;
824 +       return;
825 +}
826 +
827 +static int __init
828 +acpi_parse_int_src_ovr(acpi_table_entry_header * header,
829 +                      const unsigned long end)
830 +{
831 +       struct acpi_table_int_src_ovr *intsrc = NULL;
832 +
833 +       intsrc = (struct acpi_table_int_src_ovr *)header;
834 +
835 +       if (BAD_MADT_ENTRY(intsrc, end))
836 +               return -EINVAL;
837 +
838 +       acpi_table_print_madt_entry(header);
839 +
840 +       if (intsrc->bus_irq == acpi_fadt.sci_int) {
841 +               acpi_sci_ioapic_setup(intsrc->global_irq,
842 +                                     intsrc->flags.polarity,
843 +                                     intsrc->flags.trigger);
844 +               return 0;
845 +       }
846 +
847 +       if (acpi_skip_timer_override &&
848 +           intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
849 +               printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
850 +               return 0;
851 +       }
852 +
853 +       mp_override_legacy_irq(intsrc->bus_irq,
854 +                              intsrc->flags.polarity,
855 +                              intsrc->flags.trigger, intsrc->global_irq);
856 +
857 +       return 0;
858 +}
859 +
860 +static int __init
861 +acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
862 +{
863 +       struct acpi_table_nmi_src *nmi_src = NULL;
864 +
865 +       nmi_src = (struct acpi_table_nmi_src *)header;
866 +
867 +       if (BAD_MADT_ENTRY(nmi_src, end))
868 +               return -EINVAL;
869 +
870 +       acpi_table_print_madt_entry(header);
871 +
872 +       /* TBD: Support nimsrc entries? */
873 +
874 +       return 0;
875 +}
876 +
877 +#endif                         /* CONFIG_X86_IO_APIC */
878 +
879 +/*
880 + * acpi_pic_sci_set_trigger()
881 + * 
882 + * use ELCR to set PIC-mode trigger type for SCI
883 + *
884 + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
885 + * it may require Edge Trigger -- use "acpi_sci=edge"
886 + *
887 + * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
888 + * for the 8259 PIC.  bit[n] = 1 means irq[n] is Level, otherwise Edge.
889 + * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0)
890 + * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0)
891 + */
892 +
893 +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
894 +{
895 +       unsigned int mask = 1 << irq;
896 +       unsigned int old, new;
897 +
898 +       /* Real old ELCR mask */
899 +       old = inb(0x4d0) | (inb(0x4d1) << 8);
900 +
901 +       /*
902 +        * If we use ACPI to set PCI irq's, then we should clear ELCR
903 +        * since we will set it correctly as we enable the PCI irq
904 +        * routing.
905 +        */
906 +       new = acpi_noirq ? old : 0;
907 +
908 +       /*
909 +        * Update SCI information in the ELCR, it isn't in the PCI
910 +        * routing tables..
911 +        */
912 +       switch (trigger) {
913 +       case 1:         /* Edge - clear */
914 +               new &= ~mask;
915 +               break;
916 +       case 3:         /* Level - set */
917 +               new |= mask;
918 +               break;
919 +       }
920 +
921 +       if (old == new)
922 +               return;
923 +
924 +       printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
925 +       outb(new, 0x4d0);
926 +       outb(new >> 8, 0x4d1);
927 +}
928 +
929 +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
930 +{
931 +       *irq = gsi;
932 +       return 0;
933 +}
934 +
935 +/*
936 + * success: return IRQ number (>=0)
937 + * failure: return < 0
938 + */
939 +int acpi_register_gsi(u32 gsi, int triggering, int polarity)
940 +{
941 +       unsigned int irq;
942 +       unsigned int plat_gsi = gsi;
943 +
944 +#ifdef CONFIG_PCI
945 +       /*
946 +        * Make sure all (legacy) PCI IRQs are set as level-triggered.
947 +        */
948 +       if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
949 +               extern void eisa_set_level_irq(unsigned int irq);
950 +
951 +               if (triggering == ACPI_LEVEL_SENSITIVE)
952 +                       eisa_set_level_irq(gsi);
953 +       }
954 +#endif
955 +
956 +#ifdef CONFIG_X86_IO_APIC
957 +       if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
958 +               plat_gsi = mp_register_gsi(gsi, triggering, polarity);
959 +       }
960 +#endif
961 +       acpi_gsi_to_irq(plat_gsi, &irq);
962 +       return irq;
963 +}
964 +
965 +EXPORT_SYMBOL(acpi_register_gsi);
966 +
967 +/*
968 + *  ACPI based hotplug support for CPU
969 + */
970 +#ifdef CONFIG_ACPI_HOTPLUG_CPU
971 +int acpi_map_lsapic(acpi_handle handle, int *pcpu)
972 +{
973 +       struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
974 +       union acpi_object *obj;
975 +       struct acpi_table_lapic *lapic;
976 +       cpumask_t tmp_map, new_map;
977 +       u8 physid;
978 +       int cpu;
979 +
980 +       if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
981 +               return -EINVAL;
982 +
983 +       if (!buffer.length || !buffer.pointer)
984 +               return -EINVAL;
985 +
986 +       obj = buffer.pointer;
987 +       if (obj->type != ACPI_TYPE_BUFFER ||
988 +           obj->buffer.length < sizeof(*lapic)) {
989 +               kfree(buffer.pointer);
990 +               return -EINVAL;
991 +       }
992 +
993 +       lapic = (struct acpi_table_lapic *)obj->buffer.pointer;
994 +
995 +       if ((lapic->header.type != ACPI_MADT_LAPIC) ||
996 +           (!lapic->flags.enabled)) {
997 +               kfree(buffer.pointer);
998 +               return -EINVAL;
999 +       }
1000 +
1001 +       physid = lapic->id;
1002 +
1003 +       kfree(buffer.pointer);
1004 +       buffer.length = ACPI_ALLOCATE_BUFFER;
1005 +       buffer.pointer = NULL;
1006 +
1007 +       tmp_map = cpu_present_map;
1008 +       mp_register_lapic(physid, lapic->flags.enabled);
1009 +
1010 +       /*
1011 +        * If mp_register_lapic successfully generates a new logical cpu
1012 +        * number, then the following will get us exactly what was mapped
1013 +        */
1014 +       cpus_andnot(new_map, cpu_present_map, tmp_map);
1015 +       if (cpus_empty(new_map)) {
1016 +               printk ("Unable to map lapic to logical cpu number\n");
1017 +               return -EINVAL;
1018 +       }
1019 +
1020 +       cpu = first_cpu(new_map);
1021 +
1022 +       *pcpu = cpu;
1023 +       return 0;
1024 +}
1025 +
1026 +EXPORT_SYMBOL(acpi_map_lsapic);
1027 +
1028 +int acpi_unmap_lsapic(int cpu)
1029 +{
1030 +       int i;
1031 +
1032 +       for_each_possible_cpu(i) {
1033 +               if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) {
1034 +                       x86_acpiid_to_apicid[i] = -1;
1035 +                       break;
1036 +               }
1037 +       }
1038 +       x86_cpu_to_apicid[cpu] = -1;
1039 +       cpu_clear(cpu, cpu_present_map);
1040 +       num_processors--;
1041 +
1042 +       return (0);
1043 +}
1044 +
1045 +EXPORT_SYMBOL(acpi_unmap_lsapic);
1046 +#endif                         /* CONFIG_ACPI_HOTPLUG_CPU */
1047 +
1048 +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
1049 +{
1050 +       /* TBD */
1051 +       return -EINVAL;
1052 +}
1053 +
1054 +EXPORT_SYMBOL(acpi_register_ioapic);
1055 +
1056 +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
1057 +{
1058 +       /* TBD */
1059 +       return -EINVAL;
1060 +}
1061 +
1062 +EXPORT_SYMBOL(acpi_unregister_ioapic);
1063 +
1064 +static unsigned long __init
1065 +acpi_scan_rsdp(unsigned long start, unsigned long length)
1066 +{
1067 +       unsigned long offset = 0;
1068 +       unsigned long sig_len = sizeof("RSD PTR ") - 1;
1069 +       unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
1070 +
1071 +       /*
1072 +        * Scan all 16-byte boundaries of the physical memory region for the
1073 +        * RSDP signature.
1074 +        */
1075 +       for (offset = 0; offset < length; offset += 16) {
1076 +               if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
1077 +                       continue;
1078 +               return (start + offset);
1079 +       }
1080 +
1081 +       return 0;
1082 +}
1083 +
1084 +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
1085 +{
1086 +       struct acpi_table_sbf *sb;
1087 +
1088 +       if (!phys_addr || !size)
1089 +               return -EINVAL;
1090 +
1091 +       sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
1092 +       if (!sb) {
1093 +               printk(KERN_WARNING PREFIX "Unable to map SBF\n");
1094 +               return -ENODEV;
1095 +       }
1096 +
1097 +       sbf_port = sb->sbf_cmos;        /* Save CMOS port */
1098 +
1099 +       return 0;
1100 +}
1101 +
1102 +#ifdef CONFIG_HPET_TIMER
1103 +
1104 +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
1105 +{
1106 +       struct acpi_table_hpet *hpet_tbl;
1107 +       struct resource *hpet_res;
1108 +       resource_size_t res_start;
1109 +
1110 +       if (!phys || !size)
1111 +               return -EINVAL;
1112 +
1113 +       hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
1114 +       if (!hpet_tbl) {
1115 +               printk(KERN_WARNING PREFIX "Unable to map HPET\n");
1116 +               return -ENODEV;
1117 +       }
1118 +
1119 +       if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
1120 +               printk(KERN_WARNING PREFIX "HPET timers must be located in "
1121 +                      "memory.\n");
1122 +               return -1;
1123 +       }
1124 +
1125 +#define HPET_RESOURCE_NAME_SIZE 9
1126 +       hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
1127 +       if (hpet_res) {
1128 +               memset(hpet_res, 0, sizeof(*hpet_res));
1129 +               hpet_res->name = (void *)&hpet_res[1];
1130 +               hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1131 +               snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE,
1132 +                        "HPET %u", hpet_tbl->number);
1133 +               hpet_res->end = (1 * 1024) - 1;
1134 +       }
1135 +
1136 +#ifdef CONFIG_X86_64
1137 +       vxtime.hpet_address = hpet_tbl->addr.addrl |
1138 +           ((long)hpet_tbl->addr.addrh << 32);
1139 +
1140 +       printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1141 +              hpet_tbl->id, vxtime.hpet_address);
1142 +
1143 +       res_start = vxtime.hpet_address;
1144 +#else                          /* X86 */
1145 +       {
1146 +               extern unsigned long hpet_address;
1147 +
1148 +               hpet_address = hpet_tbl->addr.addrl;
1149 +               printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1150 +                      hpet_tbl->id, hpet_address);
1151 +
1152 +               res_start = hpet_address;
1153 +       }
1154 +#endif                         /* X86 */
1155 +
1156 +       if (hpet_res) {
1157 +               hpet_res->start = res_start;
1158 +               hpet_res->end += res_start;
1159 +               insert_resource(&iomem_resource, hpet_res);
1160 +       }
1161 +
1162 +       return 0;
1163 +}
1164 +#else
1165 +#define        acpi_parse_hpet NULL
1166 +#endif
1167 +
1168 +#ifdef CONFIG_X86_PM_TIMER
1169 +extern u32 pmtmr_ioport;
1170 +#endif
1171 +
1172 +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
1173 +{
1174 +       struct fadt_descriptor *fadt = NULL;
1175 +
1176 +       fadt = (struct fadt_descriptor *)__acpi_map_table(phys, size);
1177 +       if (!fadt) {
1178 +               printk(KERN_WARNING PREFIX "Unable to map FADT\n");
1179 +               return 0;
1180 +       }
1181 +       /* initialize sci_int early for INT_SRC_OVR MADT parsing */
1182 +       acpi_fadt.sci_int = fadt->sci_int;
1183 +
1184 +       /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
1185 +       acpi_fadt.revision = fadt->revision;
1186 +       acpi_fadt.force_apic_physical_destination_mode =
1187 +           fadt->force_apic_physical_destination_mode;
1188 +
1189 +#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
1190 +       /* detect the location of the ACPI PM Timer */
1191 +       if (fadt->revision >= FADT2_REVISION_ID) {
1192 +               /* FADT rev. 2 */
1193 +               if (fadt->xpm_tmr_blk.address_space_id !=
1194 +                   ACPI_ADR_SPACE_SYSTEM_IO)
1195 +                       return 0;
1196 +
1197 +               pmtmr_ioport = fadt->xpm_tmr_blk.address;
1198 +               /*
1199 +                * "X" fields are optional extensions to the original V1.0
1200 +                * fields, so we must selectively expand V1.0 fields if the
1201 +                * corresponding X field is zero.
1202 +                */
1203 +               if (!pmtmr_ioport)
1204 +                       pmtmr_ioport = fadt->V1_pm_tmr_blk;
1205 +       } else {
1206 +               /* FADT rev. 1 */
1207 +               pmtmr_ioport = fadt->V1_pm_tmr_blk;
1208 +       }
1209 +       if (pmtmr_ioport)
1210 +               printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
1211 +                      pmtmr_ioport);
1212 +#endif
1213 +       return 0;
1214 +}
1215 +
1216 +unsigned long __init acpi_find_rsdp(void)
1217 +{
1218 +       unsigned long rsdp_phys = 0;
1219 +
1220 +       if (efi_enabled) {
1221 +               if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
1222 +                       return efi.acpi20;
1223 +               else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
1224 +                       return efi.acpi;
1225 +       }
1226 +       /*
1227 +        * Scan memory looking for the RSDP signature. First search EBDA (low
1228 +        * memory) paragraphs and then search upper memory (E0000-FFFFF).
1229 +        */
1230 +       rsdp_phys = acpi_scan_rsdp(0, 0x400);
1231 +       if (!rsdp_phys)
1232 +               rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
1233 +
1234 +       return rsdp_phys;
1235 +}
1236 +
1237 +#ifdef CONFIG_X86_LOCAL_APIC
1238 +/*
1239 + * Parse LAPIC entries in MADT
1240 + * returns 0 on success, < 0 on error
1241 + */
1242 +static int __init acpi_parse_madt_lapic_entries(void)
1243 +{
1244 +       int count;
1245 +
1246 +       if (!cpu_has_apic)
1247 +               return -ENODEV;
1248 +
1249 +       /* 
1250 +        * Note that the LAPIC address is obtained from the MADT (32-bit value)
1251 +        * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
1252 +        */
1253 +
1254 +       count =
1255 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
1256 +                                 acpi_parse_lapic_addr_ovr, 0);
1257 +       if (count < 0) {
1258 +               printk(KERN_ERR PREFIX
1259 +                      "Error parsing LAPIC address override entry\n");
1260 +               return count;
1261 +       }
1262 +
1263 +       mp_register_lapic_address(acpi_lapic_addr);
1264 +
1265 +       count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
1266 +                                     MAX_APICS);
1267 +       if (!count) {
1268 +               printk(KERN_ERR PREFIX "No LAPIC entries present\n");
1269 +               /* TBD: Cleanup to allow fallback to MPS */
1270 +               return -ENODEV;
1271 +       } else if (count < 0) {
1272 +               printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
1273 +               /* TBD: Cleanup to allow fallback to MPS */
1274 +               return count;
1275 +       }
1276 +
1277 +       count =
1278 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
1279 +       if (count < 0) {
1280 +               printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
1281 +               /* TBD: Cleanup to allow fallback to MPS */
1282 +               return count;
1283 +       }
1284 +       return 0;
1285 +}
1286 +#endif                         /* CONFIG_X86_LOCAL_APIC */
1287 +
1288 +#ifdef CONFIG_X86_IO_APIC
1289 +/*
1290 + * Parse IOAPIC related entries in MADT
1291 + * returns 0 on success, < 0 on error
1292 + */
1293 +static int __init acpi_parse_madt_ioapic_entries(void)
1294 +{
1295 +       int count;
1296 +
1297 +       /*
1298 +        * ACPI interpreter is required to complete interrupt setup,
1299 +        * so if it is off, don't enumerate the io-apics with ACPI.
1300 +        * If MPS is present, it will handle them,
1301 +        * otherwise the system will stay in PIC mode
1302 +        */
1303 +       if (acpi_disabled || acpi_noirq) {
1304 +               return -ENODEV;
1305 +       }
1306 +
1307 +       if (!cpu_has_apic) 
1308 +               return -ENODEV;
1309 +
1310 +       /*
1311 +        * if "noapic" boot option, don't look for IO-APICs
1312 +        */
1313 +       if (skip_ioapic_setup) {
1314 +               printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
1315 +                      "due to 'noapic' option.\n");
1316 +               return -ENODEV;
1317 +       }
1318 +
1319 +       count =
1320 +           acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
1321 +                                 MAX_IO_APICS);
1322 +       if (!count) {
1323 +               printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
1324 +               return -ENODEV;
1325 +       } else if (count < 0) {
1326 +               printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
1327 +               return count;
1328 +       }
1329 +
1330 +       count =
1331 +           acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
1332 +                                 NR_IRQ_VECTORS);
1333 +       if (count < 0) {
1334 +               printk(KERN_ERR PREFIX
1335 +                      "Error parsing interrupt source overrides entry\n");
1336 +               /* TBD: Cleanup to allow fallback to MPS */
1337 +               return count;
1338 +       }
1339 +
1340 +       /*
1341 +        * If BIOS did not supply an INT_SRC_OVR for the SCI
1342 +        * pretend we got one so we can set the SCI flags.
1343 +        */
1344 +       if (!acpi_sci_override_gsi)
1345 +               acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
1346 +
1347 +       /* Fill in identity legacy mapings where no override */
1348 +       mp_config_acpi_legacy_irqs();
1349 +
1350 +       count =
1351 +           acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
1352 +                                 NR_IRQ_VECTORS);
1353 +       if (count < 0) {
1354 +               printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1355 +               /* TBD: Cleanup to allow fallback to MPS */
1356 +               return count;
1357 +       }
1358 +
1359 +       return 0;
1360 +}
1361 +#else
1362 +static inline int acpi_parse_madt_ioapic_entries(void)
1363 +{
1364 +       return -1;
1365 +}
1366 +#endif /* !CONFIG_X86_IO_APIC */
1367 +
1368 +static void __init acpi_process_madt(void)
1369 +{
1370 +#ifdef CONFIG_X86_LOCAL_APIC
1371 +       int count, error;
1372 +
1373 +       count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
1374 +       if (count >= 1) {
1375 +
1376 +               /*
1377 +                * Parse MADT LAPIC entries
1378 +                */
1379 +               error = acpi_parse_madt_lapic_entries();
1380 +               if (!error) {
1381 +                       acpi_lapic = 1;
1382 +
1383 +#ifdef CONFIG_X86_GENERICARCH
1384 +                       generic_bigsmp_probe();
1385 +#endif
1386 +                       /*
1387 +                        * Parse MADT IO-APIC entries
1388 +                        */
1389 +                       error = acpi_parse_madt_ioapic_entries();
1390 +                       if (!error) {
1391 +                               acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1392 +                               acpi_irq_balance_set(NULL);
1393 +                               acpi_ioapic = 1;
1394 +
1395 +                               smp_found_config = 1;
1396 +                               clustered_apic_check();
1397 +                       }
1398 +               }
1399 +               if (error == -EINVAL) {
1400 +                       /*
1401 +                        * Dell Precision Workstation 410, 610 come here.
1402 +                        */
1403 +                       printk(KERN_ERR PREFIX
1404 +                              "Invalid BIOS MADT, disabling ACPI\n");
1405 +                       disable_acpi();
1406 +               }
1407 +       }
1408 +#endif
1409 +       return;
1410 +}
1411 +
1412 +#ifdef __i386__
1413 +
1414 +static int __init disable_acpi_irq(struct dmi_system_id *d)
1415 +{
1416 +       if (!acpi_force) {
1417 +               printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
1418 +                      d->ident);
1419 +               acpi_noirq_set();
1420 +       }
1421 +       return 0;
1422 +}
1423 +
1424 +static int __init disable_acpi_pci(struct dmi_system_id *d)
1425 +{
1426 +       if (!acpi_force) {
1427 +               printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
1428 +                      d->ident);
1429 +               acpi_disable_pci();
1430 +       }
1431 +       return 0;
1432 +}
1433 +
1434 +static int __init dmi_disable_acpi(struct dmi_system_id *d)
1435 +{
1436 +       if (!acpi_force) {
1437 +               printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
1438 +               disable_acpi();
1439 +       } else {
1440 +               printk(KERN_NOTICE
1441 +                      "Warning: DMI blacklist says broken, but acpi forced\n");
1442 +       }
1443 +       return 0;
1444 +}
1445 +
1446 +/*
1447 + * Limit ACPI to CPU enumeration for HT
1448 + */
1449 +static int __init force_acpi_ht(struct dmi_system_id *d)
1450 +{
1451 +       if (!acpi_force) {
1452 +               printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1453 +                      d->ident);
1454 +               disable_acpi();
1455 +               acpi_ht = 1;
1456 +       } else {
1457 +               printk(KERN_NOTICE
1458 +                      "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1459 +       }
1460 +       return 0;
1461 +}
1462 +
1463 +/*
1464 + * If your system is blacklisted here, but you find that acpi=force
1465 + * works for you, please contact acpi-devel@sourceforge.net
1466 + */
1467 +static struct dmi_system_id __initdata acpi_dmi_table[] = {
1468 +       /*
1469 +        * Boxes that need ACPI disabled
1470 +        */
1471 +       {
1472 +        .callback = dmi_disable_acpi,
1473 +        .ident = "IBM Thinkpad",
1474 +        .matches = {
1475 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1476 +                    DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
1477 +                    },
1478 +        },
1479 +
1480 +       /*
1481 +        * Boxes that need acpi=ht
1482 +        */
1483 +       {
1484 +        .callback = force_acpi_ht,
1485 +        .ident = "FSC Primergy T850",
1486 +        .matches = {
1487 +                    DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1488 +                    DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1489 +                    },
1490 +        },
1491 +       {
1492 +        .callback = force_acpi_ht,
1493 +        .ident = "DELL GX240",
1494 +        .matches = {
1495 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
1496 +                    DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
1497 +                    },
1498 +        },
1499 +       {
1500 +        .callback = force_acpi_ht,
1501 +        .ident = "HP VISUALIZE NT Workstation",
1502 +        .matches = {
1503 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1504 +                    DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1505 +                    },
1506 +        },
1507 +       {
1508 +        .callback = force_acpi_ht,
1509 +        .ident = "Compaq Workstation W8000",
1510 +        .matches = {
1511 +                    DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1512 +                    DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1513 +                    },
1514 +        },
1515 +       {
1516 +        .callback = force_acpi_ht,
1517 +        .ident = "ASUS P4B266",
1518 +        .matches = {
1519 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1520 +                    DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1521 +                    },
1522 +        },
1523 +       {
1524 +        .callback = force_acpi_ht,
1525 +        .ident = "ASUS P2B-DS",
1526 +        .matches = {
1527 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1528 +                    DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1529 +                    },
1530 +        },
1531 +       {
1532 +        .callback = force_acpi_ht,
1533 +        .ident = "ASUS CUR-DLS",
1534 +        .matches = {
1535 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1536 +                    DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1537 +                    },
1538 +        },
1539 +       {
1540 +        .callback = force_acpi_ht,
1541 +        .ident = "ABIT i440BX-W83977",
1542 +        .matches = {
1543 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1544 +                    DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1545 +                    },
1546 +        },
1547 +       {
1548 +        .callback = force_acpi_ht,
1549 +        .ident = "IBM Bladecenter",
1550 +        .matches = {
1551 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1552 +                    DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1553 +                    },
1554 +        },
1555 +       {
1556 +        .callback = force_acpi_ht,
1557 +        .ident = "IBM eServer xSeries 360",
1558 +        .matches = {
1559 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1560 +                    DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1561 +                    },
1562 +        },
1563 +       {
1564 +        .callback = force_acpi_ht,
1565 +        .ident = "IBM eserver xSeries 330",
1566 +        .matches = {
1567 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1568 +                    DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1569 +                    },
1570 +        },
1571 +       {
1572 +        .callback = force_acpi_ht,
1573 +        .ident = "IBM eserver xSeries 440",
1574 +        .matches = {
1575 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1576 +                    DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1577 +                    },
1578 +        },
1579 +
1580 +       /*
1581 +        * Boxes that need ACPI PCI IRQ routing disabled
1582 +        */
1583 +       {
1584 +        .callback = disable_acpi_irq,
1585 +        .ident = "ASUS A7V",
1586 +        .matches = {
1587 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1588 +                    DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1589 +                    /* newer BIOS, Revision 1011, does work */
1590 +                    DMI_MATCH(DMI_BIOS_VERSION,
1591 +                              "ASUS A7V ACPI BIOS Revision 1007"),
1592 +                    },
1593 +        },
1594 +
1595 +       /*
1596 +        * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1597 +        */
1598 +       {                       /* _BBN 0 bug */
1599 +        .callback = disable_acpi_pci,
1600 +        .ident = "ASUS PR-DLS",
1601 +        .matches = {
1602 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1603 +                    DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1604 +                    DMI_MATCH(DMI_BIOS_VERSION,
1605 +                              "ASUS PR-DLS ACPI BIOS Revision 1010"),
1606 +                    DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1607 +                    },
1608 +        },
1609 +       {
1610 +        .callback = disable_acpi_pci,
1611 +        .ident = "Acer TravelMate 36x Laptop",
1612 +        .matches = {
1613 +                    DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1614 +                    DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1615 +                    },
1616 +        },
1617 +       {}
1618 +};
1619 +
1620 +#endif                         /* __i386__ */
1621 +
1622 +/*
1623 + * acpi_boot_table_init() and acpi_boot_init()
1624 + *  called from setup_arch(), always.
1625 + *     1. checksums all tables
1626 + *     2. enumerates lapics
1627 + *     3. enumerates io-apics
1628 + *
1629 + * acpi_table_init() is separate to allow reading SRAT without
1630 + * other side effects.
1631 + *
1632 + * side effects of acpi_boot_init:
1633 + *     acpi_lapic = 1 if LAPIC found
1634 + *     acpi_ioapic = 1 if IOAPIC found
1635 + *     if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1636 + *     if acpi_blacklisted() acpi_disabled = 1;
1637 + *     acpi_irq_model=...
1638 + *     ...
1639 + *
1640 + * return value: (currently ignored)
1641 + *     0: success
1642 + *     !0: failure
1643 + */
1644 +
1645 +int __init acpi_boot_table_init(void)
1646 +{
1647 +       int error;
1648 +
1649 +#ifdef __i386__
1650 +       dmi_check_system(acpi_dmi_table);
1651 +#endif
1652 +
1653 +       /*
1654 +        * If acpi_disabled, bail out
1655 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1656 +        */
1657 +       if (acpi_disabled && !acpi_ht)
1658 +               return 1;
1659 +
1660 +       /* 
1661 +        * Initialize the ACPI boot-time table parser.
1662 +        */
1663 +       error = acpi_table_init();
1664 +       if (error) {
1665 +               disable_acpi();
1666 +               return error;
1667 +       }
1668 +
1669 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1670 +
1671 +       /*
1672 +        * blacklist may disable ACPI entirely
1673 +        */
1674 +       error = acpi_blacklisted();
1675 +       if (error) {
1676 +               if (acpi_force) {
1677 +                       printk(KERN_WARNING PREFIX "acpi=force override\n");
1678 +               } else {
1679 +                       printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1680 +                       disable_acpi();
1681 +                       return error;
1682 +               }
1683 +       }
1684 +
1685 +       return 0;
1686 +}
1687 +
1688 +int __init acpi_boot_init(void)
1689 +{
1690 +       /*
1691 +        * If acpi_disabled, bail out
1692 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1693 +        */
1694 +       if (acpi_disabled && !acpi_ht)
1695 +               return 1;
1696 +
1697 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1698 +
1699 +       /*
1700 +        * set sci_int and PM timer address
1701 +        */
1702 +       acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
1703 +
1704 +       /*
1705 +        * Process the Multiple APIC Description Table (MADT), if present
1706 +        */
1707 +       acpi_process_madt();
1708 +
1709 +       acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
1710 +
1711 +       return 0;
1712 +}
1713 +
1714 +static int __init parse_acpi(char *arg)
1715 +{
1716 +       if (!arg)
1717 +               return -EINVAL;
1718 +
1719 +       /* "acpi=off" disables both ACPI table parsing and interpreter */
1720 +       if (strcmp(arg, "off") == 0) {
1721 +               disable_acpi();
1722 +       }
1723 +       /* acpi=force to override the blacklist */
1724 +       else if (strcmp(arg, "force") == 0) {
1725 +               acpi_force = 1;
1726 +               acpi_ht = 1;
1727 +               acpi_disabled = 0;
1728 +       }
1729 +       /* acpi=strict disables out-of-spec workarounds */
1730 +       else if (strcmp(arg, "strict") == 0) {
1731 +               acpi_strict = 1;
1732 +       }
1733 +       /* Limit ACPI just to boot-time to enable HT */
1734 +       else if (strcmp(arg, "ht") == 0) {
1735 +               if (!acpi_force)
1736 +                       disable_acpi();
1737 +               acpi_ht = 1;
1738 +       }
1739 +       /* "acpi=noirq" disables ACPI interrupt routing */
1740 +       else if (strcmp(arg, "noirq") == 0) {
1741 +               acpi_noirq_set();
1742 +       } else {
1743 +               /* Core will printk when we return error. */
1744 +               return -EINVAL;
1745 +       }
1746 +       return 0;
1747 +}
1748 +early_param("acpi", parse_acpi);
1749 +
1750 +/* FIXME: Using pci= for an ACPI parameter is a travesty. */
1751 +static int __init parse_pci(char *arg)
1752 +{
1753 +       if (arg && strcmp(arg, "noacpi") == 0)
1754 +               acpi_disable_pci();
1755 +       return 0;
1756 +}
1757 +early_param("pci", parse_pci);
1758 +
1759 +#ifdef CONFIG_X86_IO_APIC
1760 +static int __init parse_acpi_skip_timer_override(char *arg)
1761 +{
1762 +       acpi_skip_timer_override = 1;
1763 +       return 0;
1764 +}
1765 +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
1766 +
1767 +static int __init parse_acpi_use_timer_override(char *arg)
1768 +{
1769 +       acpi_use_timer_override = 1;
1770 +       return 0;
1771 +}
1772 +early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
1773 +#endif /* CONFIG_X86_IO_APIC */
1774 +
1775 +static int __init setup_acpi_sci(char *s)
1776 +{
1777 +       if (!s)
1778 +               return -EINVAL;
1779 +       if (!strcmp(s, "edge"))
1780 +               acpi_sci_flags.trigger = 1;
1781 +       else if (!strcmp(s, "level"))
1782 +               acpi_sci_flags.trigger = 3;
1783 +       else if (!strcmp(s, "high"))
1784 +               acpi_sci_flags.polarity = 1;
1785 +       else if (!strcmp(s, "low"))
1786 +               acpi_sci_flags.polarity = 3;
1787 +       else
1788 +               return -EINVAL;
1789 +       return 0;
1790 +}
1791 +early_param("acpi_sci", setup_acpi_sci);
1792 diff -ruNp linux-2.6.19/arch/i386/kernel/alternative.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/alternative.c
1793 --- linux-2.6.19/arch/i386/kernel/alternative.c 2006-11-29 21:57:37.000000000 +0000
1794 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/alternative.c       2007-02-02 19:10:21.000000000 +0000
1795 @@ -4,7 +4,11 @@
1796  #include <asm/alternative.h>
1797  #include <asm/sections.h>
1798  
1799 +#ifdef CONFIG_X86_64_XEN
1800 +static int no_replacement    = 1;
1801 +#else
1802  static int no_replacement    = 0;
1803 +#endif
1804  static int smp_alt_once      = 0;
1805  static int debug_alternative = 0;
1806  
1807 @@ -151,7 +155,11 @@ void apply_alternatives(struct alt_instr
1808  #ifdef CONFIG_X86_64
1809                 /* vsyscall code is not mapped yet. resolve it manually. */
1810                 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
1811 +#ifdef CONFIG_XEN
1812 +                       instr = __va(instr - (u8*)VSYSCALL_START + (u8*)phys_to_machine(__pa_symbol(&__vsyscall_0)));
1813 +#else
1814                         instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
1815 +#endif
1816                         DPRINTK("%s: vsyscall fixup: %p => %p\n",
1817                                 __FUNCTION__, a->instr, instr);
1818                 }
1819 @@ -347,9 +355,12 @@ void __init alternative_instructions(voi
1820         unsigned long flags;
1821         if (no_replacement) {
1822                 printk(KERN_INFO "(SMP-)alternatives turned off\n");
1823 +#ifndef CONFIG_X86_64
1824 +/* ToDo: x86_64 puts something strange there; not sure what yet */
1825                 free_init_pages("SMP alternatives",
1826                                 (unsigned long)__smp_alt_begin,
1827                                 (unsigned long)__smp_alt_end);
1828 +#endif
1829                 return;
1830         }
1831  
1832 diff -ruNp linux-2.6.19/arch/i386/kernel/apic-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/apic-xen.c
1833 --- linux-2.6.19/arch/i386/kernel/apic-xen.c    1970-01-01 00:00:00.000000000 +0000
1834 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/apic-xen.c  2007-02-02 19:10:21.000000000 +0000
1835 @@ -0,0 +1,223 @@
1836 +/*
1837 + *     Local APIC handling, local APIC timers
1838 + *
1839 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
1840 + *
1841 + *     Fixes
1842 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
1843 + *                                     thanks to Eric Gilmore
1844 + *                                     and Rolf G. Tews
1845 + *                                     for testing these extensively.
1846 + *     Maciej W. Rozycki       :       Various updates and fixes.
1847 + *     Mikael Pettersson       :       Power Management for UP-APIC.
1848 + *     Pavel Machek and
1849 + *     Mikael Pettersson       :       PM converted to driver model.
1850 + */
1851 +
1852 +#include <linux/init.h>
1853 +
1854 +#include <linux/mm.h>
1855 +#include <linux/delay.h>
1856 +#include <linux/bootmem.h>
1857 +#include <linux/smp_lock.h>
1858 +#include <linux/interrupt.h>
1859 +#include <linux/mc146818rtc.h>
1860 +#include <linux/kernel_stat.h>
1861 +#include <linux/sysdev.h>
1862 +#include <linux/cpu.h>
1863 +#include <linux/module.h>
1864 +
1865 +#include <asm/atomic.h>
1866 +#include <asm/smp.h>
1867 +#include <asm/mtrr.h>
1868 +#include <asm/mpspec.h>
1869 +#include <asm/desc.h>
1870 +#include <asm/arch_hooks.h>
1871 +#include <asm/hpet.h>
1872 +#include <asm/i8253.h>
1873 +#include <asm/nmi.h>
1874 +
1875 +#include <mach_apic.h>
1876 +#include <mach_apicdef.h>
1877 +#include <mach_ipi.h>
1878 +
1879 +#include "io_ports.h"
1880 +
1881 +#ifndef CONFIG_XEN
1882 +/*
1883 + * cpu_mask that denotes the CPUs that need timer interrupts coming in as
1884 + * IPIs in place of local APIC timers
1885 + */
1886 +static cpumask_t timer_bcast_ipi;
1887 +#endif
1888 +
1889 +/*
1890 + * Knob to control our willingness to enable the local APIC.
1891 + */
1892 +static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
1893 +
1894 +static inline void lapic_disable(void)
1895 +{
1896 +       enable_local_apic = -1;
1897 +       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1898 +}
1899 +
1900 +static inline void lapic_enable(void)
1901 +{
1902 +       enable_local_apic = 1;
1903 +}
1904 +
1905 +/*
1906 + * Debug level
1907 + */
1908 +int apic_verbosity;
1909 +
1910 +static int modern_apic(void)
1911 +{
1912 +#ifndef CONFIG_XEN
1913 +       unsigned int lvr, version;
1914 +       /* AMD systems use old APIC versions, so check the CPU */
1915 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
1916 +               boot_cpu_data.x86 >= 0xf)
1917 +               return 1;
1918 +       lvr = apic_read(APIC_LVR);
1919 +       version = GET_APIC_VERSION(lvr);
1920 +       return version >= 0x14;
1921 +#else
1922 +       return 1;
1923 +#endif
1924 +}
1925 +
1926 +/*
1927 + * 'what should we do if we get a hw irq event on an illegal vector'.
1928 + * Each architecture has to answer this itself.
1929 + */
1930 +void ack_bad_irq(unsigned int irq)
1931 +{
1932 +       printk("unexpected IRQ trap at vector %02x\n", irq);
1933 +       /*
1934 +        * Currently unexpected vectors happen only on SMP and APIC.
1935 +        * We _must_ ack these because every local APIC has only N
1936 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
1937 +        * holds up an irq slot - in excessive cases (when multiple
1938 +        * unexpected vectors occur) that might lock up the APIC
1939 +        * completely.
1940 +        * But only ack when the APIC is enabled -AK
1941 +        */
1942 +       if (cpu_has_apic)
1943 +               ack_APIC_irq();
1944 +}
1945 +
1946 +#ifndef CONFIG_XEN
1947 +void __init apic_intr_init(void)
1948 +{
1949 +#ifdef CONFIG_SMP
1950 +       smp_intr_init();
1951 +#endif
1952 +       /* self generated IPI for local APIC timer */
1953 +       set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1954 +
1955 +       /* IPI vectors for APIC spurious and error interrupts */
1956 +       set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1957 +       set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1958 +
1959 +       /* thermal monitor LVT interrupt */
1960 +#ifdef CONFIG_X86_MCE_P4THERMAL
1961 +       set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1962 +#endif
1963 +}
1964 +
1965 +/* Using APIC to generate smp_local_timer_interrupt? */
1966 +int using_apic_timer __read_mostly = 0;
1967 +
1968 +static int enabled_via_apicbase;
1969 +
1970 +void enable_NMI_through_LVT0 (void * dummy)
1971 +{
1972 +       unsigned int v, ver;
1973 +
1974 +       ver = apic_read(APIC_LVR);
1975 +       ver = GET_APIC_VERSION(ver);
1976 +       v = APIC_DM_NMI;                        /* unmask and set to NMI */
1977 +       if (!APIC_INTEGRATED(ver))              /* 82489DX */
1978 +               v |= APIC_LVT_LEVEL_TRIGGER;
1979 +       apic_write_around(APIC_LVT0, v);
1980 +}
1981 +#endif /* !CONFIG_XEN */
1982 +
1983 +int get_physical_broadcast(void)
1984 +{
1985 +       if (modern_apic())
1986 +               return 0xff;
1987 +       else
1988 +               return 0xf;
1989 +}
1990 +
1991 +#ifndef CONFIG_XEN
1992 +#ifndef CONFIG_SMP
1993 +static void up_apic_timer_interrupt_call(void)
1994 +{
1995 +       int cpu = smp_processor_id();
1996 +
1997 +       /*
1998 +        * the NMI deadlock-detector uses this.
1999 +        */
2000 +       per_cpu(irq_stat, cpu).apic_timer_irqs++;
2001 +
2002 +       smp_local_timer_interrupt();
2003 +}
2004 +#endif
2005 +
2006 +void smp_send_timer_broadcast_ipi(void)
2007 +{
2008 +       cpumask_t mask;
2009 +
2010 +       cpus_and(mask, cpu_online_map, timer_bcast_ipi);
2011 +       if (!cpus_empty(mask)) {
2012 +#ifdef CONFIG_SMP
2013 +               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
2014 +#else
2015 +               /*
2016 +                * We can call the APIC timer interrupt handler directly
2017 +                * in the UP case, minus all IRQ-related functions.
2018 +                */
2019 +               up_apic_timer_interrupt_call();
2020 +#endif
2021 +       }
2022 +}
2023 +#endif
2024 +
2025 +int setup_profiling_timer(unsigned int multiplier)
2026 +{
2027 +       return -EINVAL;
2028 +}
2029 +
2030 +/*
2031 + * This initializes the IO-APIC and APIC hardware if this is
2032 + * a UP kernel.
2033 + */
2034 +int __init APIC_init_uniprocessor (void)
2035 +{
2036 +#ifdef CONFIG_X86_IO_APIC
2037 +       if (smp_found_config)
2038 +               if (!skip_ioapic_setup && nr_ioapics)
2039 +                       setup_IO_APIC();
2040 +#endif
2041 +
2042 +       return 0;
2043 +}
2044 +
2045 +static int __init parse_lapic(char *arg)
2046 +{
2047 +       lapic_enable();
2048 +       return 0;
2049 +}
2050 +early_param("lapic", parse_lapic);
2051 +
2052 +static int __init parse_nolapic(char *arg)
2053 +{
2054 +       lapic_disable();
2055 +       return 0;
2056 +}
2057 +early_param("nolapic", parse_nolapic);
2058 +
2059 diff -ruNp linux-2.6.19/arch/i386/kernel/asm-offsets.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/asm-offsets.c
2060 --- linux-2.6.19/arch/i386/kernel/asm-offsets.c 2006-11-29 21:57:37.000000000 +0000
2061 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/asm-offsets.c       2007-02-02 19:10:21.000000000 +0000
2062 @@ -66,9 +66,14 @@ void foo(void)
2063         OFFSET(pbe_orig_address, pbe, orig_address);
2064         OFFSET(pbe_next, pbe, next);
2065  
2066 +#ifndef CONFIG_X86_NO_TSS
2067         /* Offset from the sysenter stack to tss.esp0 */
2068 -       DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
2069 +       DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) -
2070                  sizeof(struct tss_struct));
2071 +#else
2072 +       /* sysenter stack points directly to esp0 */
2073 +       DEFINE(SYSENTER_stack_esp0, 0);
2074 +#endif
2075  
2076         DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
2077         DEFINE(VDSO_PRELINK, VDSO_PRELINK);
2078 diff -ruNp linux-2.6.19/arch/i386/kernel/cpu/Makefile linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/Makefile
2079 --- linux-2.6.19/arch/i386/kernel/cpu/Makefile  2006-11-29 21:57:37.000000000 +0000
2080 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/Makefile        2007-02-02 19:10:21.000000000 +0000
2081 @@ -17,3 +17,8 @@ obj-$(CONFIG_X86_MCE) +=      mcheck/
2082  
2083  obj-$(CONFIG_MTRR)     +=      mtrr/
2084  obj-$(CONFIG_CPU_FREQ) +=      cpufreq/
2085 +
2086 +ifdef CONFIG_XEN
2087 +include $(srctree)/scripts/Makefile.xen
2088 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
2089 +endif
2090 diff -ruNp linux-2.6.19/arch/i386/kernel/cpu/common-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/common-xen.c
2091 --- linux-2.6.19/arch/i386/kernel/cpu/common-xen.c      1970-01-01 00:00:00.000000000 +0000
2092 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/common-xen.c    2007-02-02 19:10:21.000000000 +0000
2093 @@ -0,0 +1,747 @@
2094 +#include <linux/init.h>
2095 +#include <linux/string.h>
2096 +#include <linux/delay.h>
2097 +#include <linux/smp.h>
2098 +#include <linux/module.h>
2099 +#include <linux/percpu.h>
2100 +#include <linux/bootmem.h>
2101 +#include <asm/semaphore.h>
2102 +#include <asm/processor.h>
2103 +#include <asm/i387.h>
2104 +#include <asm/msr.h>
2105 +#include <asm/io.h>
2106 +#include <asm/mmu_context.h>
2107 +#include <asm/mtrr.h>
2108 +#include <asm/mce.h>
2109 +#ifdef CONFIG_X86_LOCAL_APIC
2110 +#include <asm/mpspec.h>
2111 +#include <asm/apic.h>
2112 +#include <mach_apic.h>
2113 +#endif
2114 +#include <asm/hypervisor.h>
2115 +
2116 +#include "cpu.h"
2117 +
2118 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
2119 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
2120 +
2121 +#ifndef CONFIG_XEN
2122 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
2123 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
2124 +#endif
2125 +
2126 +static int cachesize_override __cpuinitdata = -1;
2127 +static int disable_x86_fxsr __cpuinitdata;
2128 +static int disable_x86_serial_nr __cpuinitdata = 1;
2129 +static int disable_x86_sep __cpuinitdata;
2130 +
2131 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
2132 +
2133 +extern int disable_pse;
2134 +
2135 +static void __cpuinit default_init(struct cpuinfo_x86 * c)
2136 +{
2137 +       /* Not much we can do here... */
2138 +       /* Check if at least it has cpuid */
2139 +       if (c->cpuid_level == -1) {
2140 +               /* No cpuid. It must be an ancient CPU */
2141 +               if (c->x86 == 4)
2142 +                       strcpy(c->x86_model_id, "486");
2143 +               else if (c->x86 == 3)
2144 +                       strcpy(c->x86_model_id, "386");
2145 +       }
2146 +}
2147 +
2148 +static struct cpu_dev __cpuinitdata default_cpu = {
2149 +       .c_init = default_init,
2150 +       .c_vendor = "Unknown",
2151 +};
2152 +static struct cpu_dev * this_cpu = &default_cpu;
2153 +
2154 +static int __init cachesize_setup(char *str)
2155 +{
2156 +       get_option (&str, &cachesize_override);
2157 +       return 1;
2158 +}
2159 +__setup("cachesize=", cachesize_setup);
2160 +
2161 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
2162 +{
2163 +       unsigned int *v;
2164 +       char *p, *q;
2165 +
2166 +       if (cpuid_eax(0x80000000) < 0x80000004)
2167 +               return 0;
2168 +
2169 +       v = (unsigned int *) c->x86_model_id;
2170 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
2171 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
2172 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
2173 +       c->x86_model_id[48] = 0;
2174 +
2175 +       /* Intel chips right-justify this string for some dumb reason;
2176 +          undo that brain damage */
2177 +       p = q = &c->x86_model_id[0];
2178 +       while ( *p == ' ' )
2179 +            p++;
2180 +       if ( p != q ) {
2181 +            while ( *p )
2182 +                 *q++ = *p++;
2183 +            while ( q <= &c->x86_model_id[48] )
2184 +                 *q++ = '\0';  /* Zero-pad the rest */
2185 +       }
2186 +
2187 +       return 1;
2188 +}
2189 +
2190 +
2191 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
2192 +{
2193 +       unsigned int n, dummy, ecx, edx, l2size;
2194 +
2195 +       n = cpuid_eax(0x80000000);
2196 +
2197 +       if (n >= 0x80000005) {
2198 +               cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
2199 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
2200 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
2201 +               c->x86_cache_size=(ecx>>24)+(edx>>24);  
2202 +       }
2203 +
2204 +       if (n < 0x80000006)     /* Some chips just have a large L1. */
2205 +               return;
2206 +
2207 +       ecx = cpuid_ecx(0x80000006);
2208 +       l2size = ecx >> 16;
2209 +       
2210 +       /* do processor-specific cache resizing */
2211 +       if (this_cpu->c_size_cache)
2212 +               l2size = this_cpu->c_size_cache(c,l2size);
2213 +
2214 +       /* Allow user to override all this if necessary. */
2215 +       if (cachesize_override != -1)
2216 +               l2size = cachesize_override;
2217 +
2218 +       if ( l2size == 0 )
2219 +               return;         /* Again, no L2 cache is possible */
2220 +
2221 +       c->x86_cache_size = l2size;
2222 +
2223 +       printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
2224 +              l2size, ecx & 0xFF);
2225 +}
2226 +
2227 +/* Naming convention should be: <Name> [(<Codename>)] */
2228 +/* This table is only used if init_<vendor>() below doesn't set it; */
2229 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
2230 +
2231 +/* Look up CPU names by table lookup. */
2232 +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
2233 +{
2234 +       struct cpu_model_info *info;
2235 +
2236 +       if ( c->x86_model >= 16 )
2237 +               return NULL;    /* Range check */
2238 +
2239 +       if (!this_cpu)
2240 +               return NULL;
2241 +
2242 +       info = this_cpu->c_models;
2243 +
2244 +       while (info && info->family) {
2245 +               if (info->family == c->x86)
2246 +                       return info->model_names[c->x86_model];
2247 +               info++;
2248 +       }
2249 +       return NULL;            /* Not found */
2250 +}
2251 +
2252 +
2253 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
2254 +{
2255 +       char *v = c->x86_vendor_id;
2256 +       int i;
2257 +       static int printed;
2258 +
2259 +       for (i = 0; i < X86_VENDOR_NUM; i++) {
2260 +               if (cpu_devs[i]) {
2261 +                       if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
2262 +                           (cpu_devs[i]->c_ident[1] && 
2263 +                            !strcmp(v,cpu_devs[i]->c_ident[1]))) {
2264 +                               c->x86_vendor = i;
2265 +                               if (!early)
2266 +                                       this_cpu = cpu_devs[i];
2267 +                               return;
2268 +                       }
2269 +               }
2270 +       }
2271 +       if (!printed) {
2272 +               printed++;
2273 +               printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
2274 +               printk(KERN_ERR "CPU: Your system may be unstable.\n");
2275 +       }
2276 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
2277 +       this_cpu = &default_cpu;
2278 +}
2279 +
2280 +
2281 +static int __init x86_fxsr_setup(char * s)
2282 +{
2283 +       /* Tell all the other CPUs not to use it... */
2284 +       disable_x86_fxsr = 1;
2285 +
2286 +       /*
2287 +        * ... and clear the bits early in the boot_cpu_data
2288 +        * so that the bootup process doesn't try to do this
2289 +        * either.
2290 +        */
2291 +       clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
2292 +       clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
2293 +       return 1;
2294 +}
2295 +__setup("nofxsr", x86_fxsr_setup);
2296 +
2297 +
2298 +static int __init x86_sep_setup(char * s)
2299 +{
2300 +       disable_x86_sep = 1;
2301 +       return 1;
2302 +}
2303 +__setup("nosep", x86_sep_setup);
2304 +
2305 +
2306 +/* Standard macro to see if a specific flag is changeable */
2307 +static inline int flag_is_changeable_p(u32 flag)
2308 +{
2309 +       u32 f1, f2;
2310 +
2311 +       asm("pushfl\n\t"
2312 +           "pushfl\n\t"
2313 +           "popl %0\n\t"
2314 +           "movl %0,%1\n\t"
2315 +           "xorl %2,%0\n\t"
2316 +           "pushl %0\n\t"
2317 +           "popfl\n\t"
2318 +           "pushfl\n\t"
2319 +           "popl %0\n\t"
2320 +           "popfl\n\t"
2321 +           : "=&r" (f1), "=&r" (f2)
2322 +           : "ir" (flag));
2323 +
2324 +       return ((f1^f2) & flag) != 0;
2325 +}
2326 +
2327 +
2328 +/* Probe for the CPUID instruction */
2329 +static int __cpuinit have_cpuid_p(void)
2330 +{
2331 +       return flag_is_changeable_p(X86_EFLAGS_ID);
2332 +}
2333 +
2334 +/* Do minimum CPU detection early.
2335 +   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
2336 +   The others are not touched to avoid unwanted side effects.
2337 +
2338 +   WARNING: this function is only called on the BP.  Don't add code here
2339 +   that is supposed to run on all CPUs. */
2340 +static void __init early_cpu_detect(void)
2341 +{
2342 +       struct cpuinfo_x86 *c = &boot_cpu_data;
2343 +
2344 +       c->x86_cache_alignment = 32;
2345 +
2346 +       if (!have_cpuid_p())
2347 +               return;
2348 +
2349 +       /* Get vendor name */
2350 +       cpuid(0x00000000, &c->cpuid_level,
2351 +             (int *)&c->x86_vendor_id[0],
2352 +             (int *)&c->x86_vendor_id[8],
2353 +             (int *)&c->x86_vendor_id[4]);
2354 +
2355 +       get_cpu_vendor(c, 1);
2356 +
2357 +       c->x86 = 4;
2358 +       if (c->cpuid_level >= 0x00000001) {
2359 +               u32 junk, tfms, cap0, misc;
2360 +               cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
2361 +               c->x86 = (tfms >> 8) & 15;
2362 +               c->x86_model = (tfms >> 4) & 15;
2363 +               if (c->x86 == 0xf)
2364 +                       c->x86 += (tfms >> 20) & 0xff;
2365 +               if (c->x86 >= 0x6)
2366 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
2367 +               c->x86_mask = tfms & 15;
2368 +               if (cap0 & (1<<19))
2369 +                       c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
2370 +       }
2371 +}
2372 +
2373 +static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
2374 +{
2375 +       u32 tfms, xlvl;
2376 +       int ebx;
2377 +
2378 +       if (have_cpuid_p()) {
2379 +               /* Get vendor name */
2380 +               cpuid(0x00000000, &c->cpuid_level,
2381 +                     (int *)&c->x86_vendor_id[0],
2382 +                     (int *)&c->x86_vendor_id[8],
2383 +                     (int *)&c->x86_vendor_id[4]);
2384 +               
2385 +               get_cpu_vendor(c, 0);
2386 +               /* Initialize the standard set of capabilities */
2387 +               /* Note that the vendor-specific code below might override */
2388 +       
2389 +               /* Intel-defined flags: level 0x00000001 */
2390 +               if ( c->cpuid_level >= 0x00000001 ) {
2391 +                       u32 capability, excap;
2392 +                       cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
2393 +                       c->x86_capability[0] = capability;
2394 +                       c->x86_capability[4] = excap;
2395 +                       c->x86 = (tfms >> 8) & 15;
2396 +                       c->x86_model = (tfms >> 4) & 15;
2397 +                       if (c->x86 == 0xf)
2398 +                               c->x86 += (tfms >> 20) & 0xff;
2399 +                       if (c->x86 >= 0x6)
2400 +                               c->x86_model += ((tfms >> 16) & 0xF) << 4;
2401 +                       c->x86_mask = tfms & 15;
2402 +#ifdef CONFIG_X86_HT
2403 +                       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
2404 +#else
2405 +                       c->apicid = (ebx >> 24) & 0xFF;
2406 +#endif
2407 +               } else {
2408 +                       /* Have CPUID level 0 only - unheard of */
2409 +                       c->x86 = 4;
2410 +               }
2411 +
2412 +               /* AMD-defined flags: level 0x80000001 */
2413 +               xlvl = cpuid_eax(0x80000000);
2414 +               if ( (xlvl & 0xffff0000) == 0x80000000 ) {
2415 +                       if ( xlvl >= 0x80000001 ) {
2416 +                               c->x86_capability[1] = cpuid_edx(0x80000001);
2417 +                               c->x86_capability[6] = cpuid_ecx(0x80000001);
2418 +                       }
2419 +                       if ( xlvl >= 0x80000004 )
2420 +                               get_model_name(c); /* Default name */
2421 +               }
2422 +       }
2423 +
2424 +       early_intel_workaround(c);
2425 +
2426 +#ifdef CONFIG_X86_HT
2427 +       c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
2428 +#endif
2429 +}
2430 +
2431 +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
2432 +{
2433 +       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
2434 +               /* Disable processor serial number */
2435 +               unsigned long lo,hi;
2436 +               rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2437 +               lo |= 0x200000;
2438 +               wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2439 +               printk(KERN_NOTICE "CPU serial number disabled.\n");
2440 +               clear_bit(X86_FEATURE_PN, c->x86_capability);
2441 +
2442 +               /* Disabling the serial number may affect the cpuid level */
2443 +               c->cpuid_level = cpuid_eax(0);
2444 +       }
2445 +}
2446 +
2447 +static int __init x86_serial_nr_setup(char *s)
2448 +{
2449 +       disable_x86_serial_nr = 0;
2450 +       return 1;
2451 +}
2452 +__setup("serialnumber", x86_serial_nr_setup);
2453 +
2454 +
2455 +
2456 +/*
2457 + * This does the hard work of actually picking apart the CPU stuff...
2458 + */
2459 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
2460 +{
2461 +       int i;
2462 +
2463 +       c->loops_per_jiffy = loops_per_jiffy;
2464 +       c->x86_cache_size = -1;
2465 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
2466 +       c->cpuid_level = -1;    /* CPUID not detected */
2467 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
2468 +       c->x86_vendor_id[0] = '\0'; /* Unset */
2469 +       c->x86_model_id[0] = '\0';  /* Unset */
2470 +       c->x86_max_cores = 1;
2471 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
2472 +
2473 +       if (!have_cpuid_p()) {
2474 +               /* First of all, decide if this is a 486 or higher */
2475 +               /* It's a 486 if we can modify the AC flag */
2476 +               if ( flag_is_changeable_p(X86_EFLAGS_AC) )
2477 +                       c->x86 = 4;
2478 +               else
2479 +                       c->x86 = 3;
2480 +       }
2481 +
2482 +       generic_identify(c);
2483 +
2484 +       printk(KERN_DEBUG "CPU: After generic identify, caps:");
2485 +       for (i = 0; i < NCAPINTS; i++)
2486 +               printk(" %08lx", c->x86_capability[i]);
2487 +       printk("\n");
2488 +
2489 +       if (this_cpu->c_identify) {
2490 +               this_cpu->c_identify(c);
2491 +
2492 +               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
2493 +               for (i = 0; i < NCAPINTS; i++)
2494 +                       printk(" %08lx", c->x86_capability[i]);
2495 +               printk("\n");
2496 +       }
2497 +
2498 +       /*
2499 +        * Vendor-specific initialization.  In this section we
2500 +        * canonicalize the feature flags, meaning if there are
2501 +        * features a certain CPU supports which CPUID doesn't
2502 +        * tell us, CPUID claiming incorrect flags, or other bugs,
2503 +        * we handle them here.
2504 +        *
2505 +        * At the end of this section, c->x86_capability better
2506 +        * indicate the features this CPU genuinely supports!
2507 +        */
2508 +       if (this_cpu->c_init)
2509 +               this_cpu->c_init(c);
2510 +
2511 +       /* Disable the PN if appropriate */
2512 +       squash_the_stupid_serial_number(c);
2513 +
2514 +       /*
2515 +        * The vendor-specific functions might have changed features.  Now
2516 +        * we do "generic changes."
2517 +        */
2518 +
2519 +       /* TSC disabled? */
2520 +       if ( tsc_disable )
2521 +               clear_bit(X86_FEATURE_TSC, c->x86_capability);
2522 +
2523 +       /* FXSR disabled? */
2524 +       if (disable_x86_fxsr) {
2525 +               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
2526 +               clear_bit(X86_FEATURE_XMM, c->x86_capability);
2527 +       }
2528 +
2529 +       /* SEP disabled? */
2530 +       if (disable_x86_sep)
2531 +               clear_bit(X86_FEATURE_SEP, c->x86_capability);
2532 +
2533 +       if (disable_pse)
2534 +               clear_bit(X86_FEATURE_PSE, c->x86_capability);
2535 +
2536 +       /* If the model name is still unset, do table lookup. */
2537 +       if ( !c->x86_model_id[0] ) {
2538 +               char *p;
2539 +               p = table_lookup_model(c);
2540 +               if ( p )
2541 +                       strcpy(c->x86_model_id, p);
2542 +               else
2543 +                       /* Last resort... */
2544 +                       sprintf(c->x86_model_id, "%02x/%02x",
2545 +                               c->x86, c->x86_model);
2546 +       }
2547 +
2548 +       /* Now the feature flags better reflect actual CPU features! */
2549 +
2550 +       printk(KERN_DEBUG "CPU: After all inits, caps:");
2551 +       for (i = 0; i < NCAPINTS; i++)
2552 +               printk(" %08lx", c->x86_capability[i]);
2553 +       printk("\n");
2554 +
2555 +       /*
2556 +        * On SMP, boot_cpu_data holds the common feature set between
2557 +        * all CPUs; so make sure that we indicate which features are
2558 +        * common between the CPUs.  The first time this routine gets
2559 +        * executed, c == &boot_cpu_data.
2560 +        */
2561 +       if ( c != &boot_cpu_data ) {
2562 +               /* AND the already accumulated flags with these */
2563 +               for ( i = 0 ; i < NCAPINTS ; i++ )
2564 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
2565 +       }
2566 +
2567 +       /* Init Machine Check Exception if available. */
2568 +       mcheck_init(c);
2569 +
2570 +       if (c == &boot_cpu_data)
2571 +               sysenter_setup();
2572 +       enable_sep_cpu();
2573 +
2574 +       if (c == &boot_cpu_data)
2575 +               mtrr_bp_init();
2576 +       else
2577 +               mtrr_ap_init();
2578 +}
2579 +
2580 +#ifdef CONFIG_X86_HT
2581 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
2582 +{
2583 +       u32     eax, ebx, ecx, edx;
2584 +       int     index_msb, core_bits;
2585 +
2586 +       cpuid(1, &eax, &ebx, &ecx, &edx);
2587 +
2588 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
2589 +               return;
2590 +
2591 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
2592 +
2593 +       if (smp_num_siblings == 1) {
2594 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
2595 +       } else if (smp_num_siblings > 1 ) {
2596 +
2597 +               if (smp_num_siblings > NR_CPUS) {
2598 +                       printk(KERN_WARNING "CPU: Unsupported number of the "
2599 +                                       "siblings %d", smp_num_siblings);
2600 +                       smp_num_siblings = 1;
2601 +                       return;
2602 +               }
2603 +
2604 +               index_msb = get_count_order(smp_num_siblings);
2605 +               c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
2606 +
2607 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
2608 +                      c->phys_proc_id);
2609 +
2610 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
2611 +
2612 +               index_msb = get_count_order(smp_num_siblings) ;
2613 +
2614 +               core_bits = get_count_order(c->x86_max_cores);
2615 +
2616 +               c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
2617 +                                              ((1 << core_bits) - 1);
2618 +
2619 +               if (c->x86_max_cores > 1)
2620 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
2621 +                              c->cpu_core_id);
2622 +       }
2623 +}
2624 +#endif
2625 +
2626 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
2627 +{
2628 +       char *vendor = NULL;
2629 +
2630 +       if (c->x86_vendor < X86_VENDOR_NUM)
2631 +               vendor = this_cpu->c_vendor;
2632 +       else if (c->cpuid_level >= 0)
2633 +               vendor = c->x86_vendor_id;
2634 +
2635 +       if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
2636 +               printk("%s ", vendor);
2637 +
2638 +       if (!c->x86_model_id[0])
2639 +               printk("%d86", c->x86);
2640 +       else
2641 +               printk("%s", c->x86_model_id);
2642 +
2643 +       if (c->x86_mask || c->cpuid_level >= 0) 
2644 +               printk(" stepping %02x\n", c->x86_mask);
2645 +       else
2646 +               printk("\n");
2647 +}
2648 +
2649 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
2650 +
2651 +/* This is hacky. :)
2652 + * We're emulating future behavior.
2653 + * In the future, the cpu-specific init functions will be called implicitly
2654 + * via the magic of initcalls.
2655 + * They will insert themselves into the cpu_devs structure.
2656 + * Then, when cpu_init() is called, we can just iterate over that array.
2657 + */
2658 +
2659 +extern int intel_cpu_init(void);
2660 +extern int cyrix_init_cpu(void);
2661 +extern int nsc_init_cpu(void);
2662 +extern int amd_init_cpu(void);
2663 +extern int centaur_init_cpu(void);
2664 +extern int transmeta_init_cpu(void);
2665 +extern int rise_init_cpu(void);
2666 +extern int nexgen_init_cpu(void);
2667 +extern int umc_init_cpu(void);
2668 +
2669 +void __init early_cpu_init(void)
2670 +{
2671 +       intel_cpu_init();
2672 +       cyrix_init_cpu();
2673 +       nsc_init_cpu();
2674 +       amd_init_cpu();
2675 +       centaur_init_cpu();
2676 +       transmeta_init_cpu();
2677 +       rise_init_cpu();
2678 +       nexgen_init_cpu();
2679 +       umc_init_cpu();
2680 +       early_cpu_detect();
2681 +
2682 +#ifdef CONFIG_DEBUG_PAGEALLOC
2683 +       /* pse is not compatible with on-the-fly unmapping,
2684 +        * disable it even if the cpus claim to support it.
2685 +        */
2686 +       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
2687 +       disable_pse = 1;
2688 +#endif
2689 +}
2690 +
2691 +void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
2692 +{
2693 +       unsigned long frames[16];
2694 +       unsigned long va;
2695 +       int f;
2696 +
2697 +       for (va = gdt_descr->address, f = 0;
2698 +            va < gdt_descr->address + gdt_descr->size;
2699 +            va += PAGE_SIZE, f++) {
2700 +               frames[f] = virt_to_mfn(va);
2701 +               make_lowmem_page_readonly(
2702 +                       (void *)va, XENFEAT_writable_descriptor_tables);
2703 +       }
2704 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
2705 +               BUG();
2706 +}
2707 +
2708 +/*
2709 + * cpu_init() initializes state that is per-CPU. Some data is already
2710 + * initialized (naturally) in the bootstrap process, such as the GDT
2711 + * and IDT. We reload them nevertheless, this function acts as a
2712 + * 'CPU state barrier', nothing should get across.
2713 + */
2714 +void __cpuinit cpu_init(void)
2715 +{
2716 +       int cpu = smp_processor_id();
2717 +#ifndef CONFIG_X86_NO_TSS
2718 +       struct tss_struct * t = &per_cpu(init_tss, cpu);
2719 +#endif
2720 +       struct thread_struct *thread = &current->thread;
2721 +       struct desc_struct *gdt;
2722 +       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2723 +
2724 +       if (cpu_test_and_set(cpu, cpu_initialized)) {
2725 +               printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
2726 +               for (;;) local_irq_enable();
2727 +       }
2728 +       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
2729 +
2730 +       if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
2731 +               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
2732 +       if (tsc_disable && cpu_has_tsc) {
2733 +               printk(KERN_NOTICE "Disabling TSC...\n");
2734 +               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
2735 +               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
2736 +               set_in_cr4(X86_CR4_TSD);
2737 +       }
2738 +
2739 +#ifndef CONFIG_XEN
2740 +       /* The CPU hotplug case */
2741 +       if (cpu_gdt_descr->address) {
2742 +               gdt = (struct desc_struct *)cpu_gdt_descr->address;
2743 +               memset(gdt, 0, PAGE_SIZE);
2744 +               goto old_gdt;
2745 +       }
2746 +       /*
2747 +        * This is a horrible hack to allocate the GDT.  The problem
2748 +        * is that cpu_init() is called really early for the boot CPU
2749 +        * (and hence needs bootmem) but much later for the secondary
2750 +        * CPUs, when bootmem will have gone away
2751 +        */
2752 +       if (NODE_DATA(0)->bdata->node_bootmem_map) {
2753 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2754 +               /* alloc_bootmem_pages panics on failure, so no check */
2755 +               memset(gdt, 0, PAGE_SIZE);
2756 +       } else {
2757 +               gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
2758 +               if (unlikely(!gdt)) {
2759 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
2760 +                       for (;;)
2761 +                               local_irq_enable();
2762 +               }
2763 +       }
2764 +old_gdt:
2765 +       /*
2766 +        * Initialize the per-CPU GDT with the boot GDT,
2767 +        * and set up the GDT descriptor:
2768 +        */
2769 +       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2770 +
2771 +       /* Set up GDT entry for 16bit stack */
2772 +       *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
2773 +               ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
2774 +               ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
2775 +               (CPU_16BIT_STACK_SIZE - 1);
2776 +
2777 +       cpu_gdt_descr->size = GDT_SIZE - 1;
2778 +       cpu_gdt_descr->address = (unsigned long)gdt;
2779 +#else
2780 +       if (cpu == 0 && cpu_gdt_descr->address == 0) {
2781 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2782 +               /* alloc_bootmem_pages panics on failure, so no check */
2783 +               memset(gdt, 0, PAGE_SIZE);
2784 +
2785 +               memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2786 +               
2787 +               cpu_gdt_descr->size = GDT_SIZE;
2788 +               cpu_gdt_descr->address = (unsigned long)gdt;
2789 +       }
2790 +#endif
2791 +
2792 +       cpu_gdt_init(cpu_gdt_descr);
2793 +
2794 +       /*
2795 +        * Set up and load the per-CPU TSS and LDT
2796 +        */
2797 +       atomic_inc(&init_mm.mm_count);
2798 +       current->active_mm = &init_mm;
2799 +       BUG_ON(current->mm);
2800 +       enter_lazy_tlb(&init_mm, current);
2801 +
2802 +       load_esp0(t, thread);
2803 +
2804 +       load_LDT(&init_mm.context);
2805 +
2806 +#ifdef CONFIG_DOUBLEFAULT
2807 +       /* Set up doublefault TSS pointer in the GDT */
2808 +       __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
2809 +#endif
2810 +
2811 +       /* Clear %fs and %gs. */
2812 +       asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
2813 +
2814 +       /* Clear all 6 debug registers: */
2815 +       set_debugreg(0, 0);
2816 +       set_debugreg(0, 1);
2817 +       set_debugreg(0, 2);
2818 +       set_debugreg(0, 3);
2819 +       set_debugreg(0, 6);
2820 +       set_debugreg(0, 7);
2821 +
2822 +       /*
2823 +        * Force FPU initialization:
2824 +        */
2825 +       current_thread_info()->status = 0;
2826 +       clear_used_math();
2827 +       mxcsr_feature_mask_init();
2828 +}
2829 +
2830 +#ifdef CONFIG_HOTPLUG_CPU
2831 +void __cpuinit cpu_uninit(void)
2832 +{
2833 +       int cpu = raw_smp_processor_id();
2834 +       cpu_clear(cpu, cpu_initialized);
2835 +
2836 +       /* lazy TLB state */
2837 +       per_cpu(cpu_tlbstate, cpu).state = 0;
2838 +       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
2839 +}
2840 +#endif
2841 diff -ruNp linux-2.6.19/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/mtrr/Makefile
2842 --- linux-2.6.19/arch/i386/kernel/cpu/mtrr/Makefile     2006-11-29 21:57:37.000000000 +0000
2843 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/mtrr/Makefile   2007-02-02 19:10:21.000000000 +0000
2844 @@ -3,3 +3,10 @@ obj-y          += amd.o
2845  obj-y          += cyrix.o
2846  obj-y          += centaur.o
2847  
2848 +ifdef CONFIG_XEN
2849 +include $(srctree)/scripts/Makefile.xen
2850 +n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o
2851 +
2852 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
2853 +obj-y := $(call cherrypickxen, $(obj-y))
2854 +endif
2855 diff -ruNp linux-2.6.19/arch/i386/kernel/cpu/mtrr/main-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/mtrr/main-xen.c
2856 --- linux-2.6.19/arch/i386/kernel/cpu/mtrr/main-xen.c   1970-01-01 00:00:00.000000000 +0000
2857 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/cpu/mtrr/main-xen.c 2007-02-02 19:10:21.000000000 +0000
2858 @@ -0,0 +1,198 @@
2859 +#include <linux/init.h>
2860 +#include <linux/proc_fs.h>
2861 +#include <linux/ctype.h>
2862 +#include <linux/module.h>
2863 +#include <linux/seq_file.h>
2864 +#include <linux/mutex.h>
2865 +#include <asm/uaccess.h>
2866 +
2867 +#include <asm/mtrr.h>
2868 +#include "mtrr.h"
2869 +
2870 +static DEFINE_MUTEX(mtrr_mutex);
2871 +
2872 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
2873 +                     unsigned int *size, mtrr_type * type)
2874 +{
2875 +       dom0_op_t op;
2876 +
2877 +       op.cmd = DOM0_READ_MEMTYPE;
2878 +       op.u.read_memtype.reg = reg;
2879 +       (void)HYPERVISOR_dom0_op(&op);
2880 +
2881 +       *size = op.u.read_memtype.nr_mfns;
2882 +       *base = op.u.read_memtype.mfn;
2883 +       *type = op.u.read_memtype.type;
2884 +}
2885 +
2886 +struct mtrr_ops generic_mtrr_ops = {
2887 +       .use_intel_if      = 1,
2888 +       .get               = generic_get_mtrr,
2889 +};
2890 +
2891 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
2892 +unsigned int num_var_ranges;
2893 +unsigned int *usage_table;
2894 +
2895 +/*  This function determines the number of variable MTRRs  */
2896 +static void __init set_num_var_ranges(void)
2897 +{
2898 +       dom0_op_t op;
2899 +
2900 +       for (num_var_ranges = 0; ; num_var_ranges++) {
2901 +               op.cmd = DOM0_READ_MEMTYPE;
2902 +               op.u.read_memtype.reg = num_var_ranges;
2903 +               if (HYPERVISOR_dom0_op(&op) != 0)
2904 +                       break;
2905 +       }
2906 +}
2907 +
2908 +static void __init init_table(void)
2909 +{
2910 +       int i, max;
2911 +
2912 +       max = num_var_ranges;
2913 +       if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
2914 +           == NULL) {
2915 +               printk(KERN_ERR "mtrr: could not allocate\n");
2916 +               return;
2917 +       }
2918 +       for (i = 0; i < max; i++)
2919 +               usage_table[i] = 0;
2920 +}
2921 +
2922 +int mtrr_add_page(unsigned long base, unsigned long size, 
2923 +                 unsigned int type, char increment)
2924 +{
2925 +       int error;
2926 +       dom0_op_t op;
2927 +
2928 +       mutex_lock(&mtrr_mutex);
2929 +
2930 +       op.cmd = DOM0_ADD_MEMTYPE;
2931 +       op.u.add_memtype.mfn     = base;
2932 +       op.u.add_memtype.nr_mfns = size;
2933 +       op.u.add_memtype.type    = type;
2934 +       error = HYPERVISOR_dom0_op(&op);
2935 +       if (error) {
2936 +               mutex_unlock(&mtrr_mutex);
2937 +               BUG_ON(error > 0);
2938 +               return error;
2939 +       }
2940 +
2941 +       if (increment)
2942 +               ++usage_table[op.u.add_memtype.reg];
2943 +
2944 +       mutex_unlock(&mtrr_mutex);
2945 +
2946 +       return op.u.add_memtype.reg;
2947 +}
2948 +
2949 +static int mtrr_check(unsigned long base, unsigned long size)
2950 +{
2951 +       if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
2952 +               printk(KERN_WARNING
2953 +                       "mtrr: size and base must be multiples of 4 kiB\n");
2954 +               printk(KERN_DEBUG
2955 +                       "mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
2956 +               dump_stack();
2957 +               return -1;
2958 +       }
2959 +       return 0;
2960 +}
2961 +
2962 +int
2963 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
2964 +        char increment)
2965 +{
2966 +       if (mtrr_check(base, size))
2967 +               return -EINVAL;
2968 +       return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
2969 +                            increment);
2970 +}
2971 +
2972 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
2973 +{
2974 +       unsigned i;
2975 +       mtrr_type ltype;
2976 +       unsigned long lbase;
2977 +       unsigned int lsize;
2978 +       int error = -EINVAL;
2979 +       dom0_op_t op;
2980 +
2981 +       mutex_lock(&mtrr_mutex);
2982 +
2983 +       if (reg < 0) {
2984 +               /*  Search for existing MTRR  */
2985 +               for (i = 0; i < num_var_ranges; ++i) {
2986 +                       mtrr_if->get(i, &lbase, &lsize, &ltype);
2987 +                       if (lbase == base && lsize == size) {
2988 +                               reg = i;
2989 +                               break;
2990 +                       }
2991 +               }
2992 +               if (reg < 0) {
2993 +                       printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
2994 +                              size);
2995 +                       goto out;
2996 +               }
2997 +       }
2998 +       if (usage_table[reg] < 1) {
2999 +               printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
3000 +               goto out;
3001 +       }
3002 +       if (--usage_table[reg] < 1) {
3003 +               op.cmd = DOM0_DEL_MEMTYPE;
3004 +               op.u.del_memtype.handle = 0;
3005 +               op.u.del_memtype.reg    = reg;
3006 +               error = HYPERVISOR_dom0_op(&op);
3007 +               if (error) {
3008 +                       BUG_ON(error > 0);
3009 +                       goto out;
3010 +               }
3011 +       }
3012 +       error = reg;
3013 + out:
3014 +       mutex_unlock(&mtrr_mutex);
3015 +       return error;
3016 +}
3017 +
3018 +int
3019 +mtrr_del(int reg, unsigned long base, unsigned long size)
3020 +{
3021 +       if (mtrr_check(base, size))
3022 +               return -EINVAL;
3023 +       return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
3024 +}
3025 +
3026 +EXPORT_SYMBOL(mtrr_add);
3027 +EXPORT_SYMBOL(mtrr_del);
3028 +
3029 +void __init mtrr_bp_init(void)
3030 +{
3031 +}
3032 +
3033 +void mtrr_ap_init(void)
3034 +{
3035 +}
3036 +
3037 +static int __init mtrr_init(void)
3038 +{
3039 +       struct cpuinfo_x86 *c = &boot_cpu_data;
3040 +
3041 +       if (!is_initial_xendomain())
3042 +               return -ENODEV;
3043 +
3044 +       if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
3045 +           (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
3046 +           (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
3047 +           (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
3048 +               return -ENODEV;
3049 +
3050 +       set_num_var_ranges();
3051 +       init_table();
3052 +
3053 +       return 0;
3054 +}
3055 +
3056 +subsys_initcall(mtrr_init);
3057 diff -ruNp linux-2.6.19/arch/i386/kernel/crash.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/crash.c
3058 --- linux-2.6.19/arch/i386/kernel/crash.c       2006-11-29 21:57:37.000000000 +0000
3059 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/crash.c     2007-02-02 19:10:21.000000000 +0000
3060 @@ -93,6 +93,7 @@ static void crash_save_self(struct pt_re
3061         crash_save_this_cpu(regs, cpu);
3062  }
3063  
3064 +#ifndef CONFIG_XEN
3065  #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
3066  static atomic_t waiting_for_crash_ipi;
3067  
3068 @@ -174,6 +175,7 @@ static void nmi_shootdown_cpus(void)
3069         /* There are no cpus to shootdown */
3070  }
3071  #endif
3072 +#endif /* CONFIG_XEN */
3073  
3074  void machine_crash_shutdown(struct pt_regs *regs)
3075  {
3076 @@ -190,10 +192,12 @@ void machine_crash_shutdown(struct pt_re
3077  
3078         /* Make a note of crashing cpu. Will be used in NMI callback.*/
3079         crashing_cpu = safe_smp_processor_id();
3080 +#ifndef CONFIG_XEN
3081         nmi_shootdown_cpus();
3082         lapic_shutdown();
3083  #if defined(CONFIG_X86_IO_APIC)
3084         disable_IO_APIC();
3085  #endif
3086 +#endif /* CONFIG_XEN */
3087         crash_save_self(regs);
3088  }
3089 diff -ruNp linux-2.6.19/arch/i386/kernel/early_printk-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/early_printk-xen.c
3090 --- linux-2.6.19/arch/i386/kernel/early_printk-xen.c    1970-01-01 00:00:00.000000000 +0000
3091 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/early_printk-xen.c  2007-02-02 19:10:21.000000000 +0000
3092 @@ -0,0 +1,2 @@
3093 +
3094 +#include "../../x86_64/kernel/early_printk-xen.c"
3095 diff -ruNp linux-2.6.19/arch/i386/kernel/entry-xen.S linux-2.6.19-xen-3.0.4/arch/i386/kernel/entry-xen.S
3096 --- linux-2.6.19/arch/i386/kernel/entry-xen.S   1970-01-01 00:00:00.000000000 +0000
3097 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/entry-xen.S 2007-02-02 19:10:21.000000000 +0000
3098 @@ -0,0 +1,1254 @@
3099 +/*
3100 + *  linux/arch/i386/entry.S
3101 + *
3102 + *  Copyright (C) 1991, 1992  Linus Torvalds
3103 + */
3104 +
3105 +/*
3106 + * entry.S contains the system-call and fault low-level handling routines.
3107 + * This also contains the timer-interrupt handler, as well as all interrupts
3108 + * and faults that can result in a task-switch.
3109 + *
3110 + * NOTE: This code handles signal-recognition, which happens every time
3111 + * after a timer-interrupt and after each system call.
3112 + *
3113 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
3114 + * on a 486.
3115 + *
3116 + * Stack layout in 'ret_from_system_call':
3117 + *     ptrace needs to have all regs on the stack.
3118 + *     if the order here is changed, it needs to be
3119 + *     updated in fork.c:copy_process, signal.c:do_signal,
3120 + *     ptrace.c and ptrace.h
3121 + *
3122 + *      0(%esp) - %ebx
3123 + *      4(%esp) - %ecx
3124 + *      8(%esp) - %edx
3125 + *       C(%esp) - %esi
3126 + *     10(%esp) - %edi
3127 + *     14(%esp) - %ebp
3128 + *     18(%esp) - %eax
3129 + *     1C(%esp) - %ds
3130 + *     20(%esp) - %es
3131 + *     24(%esp) - orig_eax
3132 + *     28(%esp) - %eip
3133 + *     2C(%esp) - %cs
3134 + *     30(%esp) - %eflags
3135 + *     34(%esp) - %oldesp
3136 + *     38(%esp) - %oldss
3137 + *
3138 + * "current" is in register %ebx during any slow entries.
3139 + */
3140 +
3141 +#include <linux/linkage.h>
3142 +#include <asm/thread_info.h>
3143 +#include <asm/irqflags.h>
3144 +#include <asm/errno.h>
3145 +#include <asm/segment.h>
3146 +#include <asm/smp.h>
3147 +#include <asm/page.h>
3148 +#include <asm/desc.h>
3149 +#include <asm/dwarf2.h>
3150 +#include "irq_vectors.h"
3151 +#include <xen/interface/xen.h>
3152 +
3153 +#define nr_syscalls ((syscall_table_size)/4)
3154 +
3155 +EBX            = 0x00
3156 +ECX            = 0x04
3157 +EDX            = 0x08
3158 +ESI            = 0x0C
3159 +EDI            = 0x10
3160 +EBP            = 0x14
3161 +EAX            = 0x18
3162 +DS             = 0x1C
3163 +ES             = 0x20
3164 +ORIG_EAX       = 0x24
3165 +EIP            = 0x28
3166 +CS             = 0x2C
3167 +EFLAGS         = 0x30
3168 +OLDESP         = 0x34
3169 +OLDSS          = 0x38
3170 +
3171 +CF_MASK                = 0x00000001
3172 +TF_MASK                = 0x00000100
3173 +IF_MASK                = 0x00000200
3174 +DF_MASK                = 0x00000400 
3175 +NT_MASK                = 0x00004000
3176 +VM_MASK                = 0x00020000
3177 +/* Pseudo-eflags. */
3178 +NMI_MASK       = 0x80000000
3179 +
3180 +#ifdef CONFIG_XEN
3181 +/* Offsets into shared_info_t. */
3182 +#define evtchn_upcall_pending          /* 0 */
3183 +#define evtchn_upcall_mask             1
3184 +
3185 +#define sizeof_vcpu_shift              6
3186 +
3187 +#ifdef CONFIG_SMP
3188 +#define GET_VCPU_INFO          movl TI_cpu(%ebp),%esi                  ; \
3189 +                               shl  $sizeof_vcpu_shift,%esi            ; \
3190 +                               addl HYPERVISOR_shared_info,%esi
3191 +#else
3192 +#define GET_VCPU_INFO          movl HYPERVISOR_shared_info,%esi
3193 +#endif
3194 +
3195 +#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(%esi)
3196 +#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(%esi)
3197 +#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(%esi)
3198 +#endif
3199 +
3200 +/* These are replacements for paravirtualization */
3201 +#ifdef CONFIG_XEN
3202 +#define DISABLE_INTERRUPTS             GET_VCPU_INFO                           ; \
3203 +                                       __DISABLE_INTERRUPTS
3204 +#define ENABLE_INTERRUPTS              GET_VCPU_INFO                           ; \
3205 +                                       __ENABLE_INTERRUPTS
3206 +#define ENABLE_INTERRUPTS_SYSEXIT      NOT_DONE_YET
3207 +#define INTERRUPT_RETURN               iret
3208 +#define GET_CR0_INTO_EAX               NOT_DONE_YET
3209 +#else
3210 +#define DISABLE_INTERRUPTS             cli
3211 +#define ENABLE_INTERRUPTS              sti
3212 +#define ENABLE_INTERRUPTS_SYSEXIT      sti; sysexit
3213 +#define INTERRUPT_RETURN               iret
3214 +#define GET_CR0_INTO_EAX               movl %cr0, %eax
3215 +#endif /* !CONFIG_XEN */
3216 +
3217 +#ifdef CONFIG_PREEMPT
3218 +#define preempt_stop           DISABLE_INTERRUPTS; TRACE_IRQS_OFF
3219 +#else
3220 +#define preempt_stop
3221 +#define resume_kernel          restore_nocheck
3222 +#endif
3223 +
3224 +.macro TRACE_IRQS_IRET
3225 +#ifdef CONFIG_TRACE_IRQFLAGS
3226 +       testl $IF_MASK,EFLAGS(%esp)     # interrupts off?
3227 +       jz 1f
3228 +       TRACE_IRQS_ON
3229 +1:
3230 +#endif
3231 +.endm
3232 +
3233 +#ifdef CONFIG_VM86
3234 +#define resume_userspace_sig   check_userspace
3235 +#else
3236 +#define resume_userspace_sig   resume_userspace
3237 +#endif
3238 +
3239 +#define SAVE_ALL \
3240 +       cld; \
3241 +       pushl %es; \
3242 +       CFI_ADJUST_CFA_OFFSET 4;\
3243 +       /*CFI_REL_OFFSET es, 0;*/\
3244 +       pushl %ds; \
3245 +       CFI_ADJUST_CFA_OFFSET 4;\
3246 +       /*CFI_REL_OFFSET ds, 0;*/\
3247 +       pushl %eax; \
3248 +       CFI_ADJUST_CFA_OFFSET 4;\
3249 +       CFI_REL_OFFSET eax, 0;\
3250 +       pushl %ebp; \
3251 +       CFI_ADJUST_CFA_OFFSET 4;\
3252 +       CFI_REL_OFFSET ebp, 0;\
3253 +       pushl %edi; \
3254 +       CFI_ADJUST_CFA_OFFSET 4;\
3255 +       CFI_REL_OFFSET edi, 0;\
3256 +       pushl %esi; \
3257 +       CFI_ADJUST_CFA_OFFSET 4;\
3258 +       CFI_REL_OFFSET esi, 0;\
3259 +       pushl %edx; \
3260 +       CFI_ADJUST_CFA_OFFSET 4;\
3261 +       CFI_REL_OFFSET edx, 0;\
3262 +       pushl %ecx; \
3263 +       CFI_ADJUST_CFA_OFFSET 4;\
3264 +       CFI_REL_OFFSET ecx, 0;\
3265 +       pushl %ebx; \
3266 +       CFI_ADJUST_CFA_OFFSET 4;\
3267 +       CFI_REL_OFFSET ebx, 0;\
3268 +       movl $(__USER_DS), %edx; \
3269 +       movl %edx, %ds; \
3270 +       movl %edx, %es;
3271 +
3272 +#define RESTORE_INT_REGS \
3273 +       popl %ebx;      \
3274 +       CFI_ADJUST_CFA_OFFSET -4;\
3275 +       CFI_RESTORE ebx;\
3276 +       popl %ecx;      \
3277 +       CFI_ADJUST_CFA_OFFSET -4;\
3278 +       CFI_RESTORE ecx;\
3279 +       popl %edx;      \
3280 +       CFI_ADJUST_CFA_OFFSET -4;\
3281 +       CFI_RESTORE edx;\
3282 +       popl %esi;      \
3283 +       CFI_ADJUST_CFA_OFFSET -4;\
3284 +       CFI_RESTORE esi;\
3285 +       popl %edi;      \
3286 +       CFI_ADJUST_CFA_OFFSET -4;\
3287 +       CFI_RESTORE edi;\
3288 +       popl %ebp;      \
3289 +       CFI_ADJUST_CFA_OFFSET -4;\
3290 +       CFI_RESTORE ebp;\
3291 +       popl %eax;      \
3292 +       CFI_ADJUST_CFA_OFFSET -4;\
3293 +       CFI_RESTORE eax
3294 +
3295 +#define RESTORE_REGS   \
3296 +       RESTORE_INT_REGS; \
3297 +1:     popl %ds;       \
3298 +       CFI_ADJUST_CFA_OFFSET -4;\
3299 +       /*CFI_RESTORE ds;*/\
3300 +2:     popl %es;       \
3301 +       CFI_ADJUST_CFA_OFFSET -4;\
3302 +       /*CFI_RESTORE es;*/\
3303 +.section .fixup,"ax";  \
3304 +3:     movl $0,(%esp); \
3305 +       jmp 1b;         \
3306 +4:     movl $0,(%esp); \
3307 +       jmp 2b;         \
3308 +.previous;             \
3309 +.section __ex_table,"a";\
3310 +       .align 4;       \
3311 +       .long 1b,3b;    \
3312 +       .long 2b,4b;    \
3313 +.previous
3314 +
3315 +#define RING0_INT_FRAME \
3316 +       CFI_STARTPROC simple;\
3317 +       CFI_SIGNAL_FRAME;\
3318 +       CFI_DEF_CFA esp, 3*4;\
3319 +       /*CFI_OFFSET cs, -2*4;*/\
3320 +       CFI_OFFSET eip, -3*4
3321 +
3322 +#define RING0_EC_FRAME \
3323 +       CFI_STARTPROC simple;\
3324 +       CFI_SIGNAL_FRAME;\
3325 +       CFI_DEF_CFA esp, 4*4;\
3326 +       /*CFI_OFFSET cs, -2*4;*/\
3327 +       CFI_OFFSET eip, -3*4
3328 +
3329 +#define RING0_PTREGS_FRAME \
3330 +       CFI_STARTPROC simple;\
3331 +       CFI_SIGNAL_FRAME;\
3332 +       CFI_DEF_CFA esp, OLDESP-EBX;\
3333 +       /*CFI_OFFSET cs, CS-OLDESP;*/\
3334 +       CFI_OFFSET eip, EIP-OLDESP;\
3335 +       /*CFI_OFFSET es, ES-OLDESP;*/\
3336 +       /*CFI_OFFSET ds, DS-OLDESP;*/\
3337 +       CFI_OFFSET eax, EAX-OLDESP;\
3338 +       CFI_OFFSET ebp, EBP-OLDESP;\
3339 +       CFI_OFFSET edi, EDI-OLDESP;\
3340 +       CFI_OFFSET esi, ESI-OLDESP;\
3341 +       CFI_OFFSET edx, EDX-OLDESP;\
3342 +       CFI_OFFSET ecx, ECX-OLDESP;\
3343 +       CFI_OFFSET ebx, EBX-OLDESP
3344 +
3345 +ENTRY(ret_from_fork)
3346 +       CFI_STARTPROC
3347 +       pushl %eax
3348 +       CFI_ADJUST_CFA_OFFSET 4
3349 +       call schedule_tail
3350 +       GET_THREAD_INFO(%ebp)
3351 +       popl %eax
3352 +       CFI_ADJUST_CFA_OFFSET -4
3353 +       pushl $0x0202                   # Reset kernel eflags
3354 +       CFI_ADJUST_CFA_OFFSET 4
3355 +       popfl
3356 +       CFI_ADJUST_CFA_OFFSET -4
3357 +       jmp syscall_exit
3358 +       CFI_ENDPROC
3359 +
3360 +/*
3361 + * Return to user mode is not as complex as all this looks,
3362 + * but we want the default path for a system call return to
3363 + * go as quickly as possible which is why some of this is
3364 + * less clear than it otherwise should be.
3365 + */
3366 +
3367 +       # userspace resumption stub bypassing syscall exit tracing
3368 +       ALIGN
3369 +       RING0_PTREGS_FRAME
3370 +ret_from_exception:
3371 +       preempt_stop
3372 +ret_from_intr:
3373 +       GET_THREAD_INFO(%ebp)
3374 +check_userspace:
3375 +       movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
3376 +       movb CS(%esp), %al
3377 +       andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
3378 +       cmpl $USER_RPL, %eax
3379 +       jb resume_kernel                # not returning to v8086 or userspace
3380 +ENTRY(resume_userspace)
3381 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3382 +                                       # setting need_resched or sigpending
3383 +                                       # between sampling and the iret
3384 +       movl TI_flags(%ebp), %ecx
3385 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
3386 +                                       # int/exception return?
3387 +       jne work_pending
3388 +       jmp restore_all
3389 +
3390 +#ifdef CONFIG_PREEMPT
3391 +ENTRY(resume_kernel)
3392 +       DISABLE_INTERRUPTS
3393 +       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
3394 +       jnz restore_nocheck
3395 +need_resched:
3396 +       movl TI_flags(%ebp), %ecx       # need_resched set ?
3397 +       testb $_TIF_NEED_RESCHED, %cl
3398 +       jz restore_all
3399 +       testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
3400 +       jz restore_all
3401 +       call preempt_schedule_irq
3402 +       jmp need_resched
3403 +#endif
3404 +       CFI_ENDPROC
3405 +
3406 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
3407 +   the vsyscall page.  See vsyscall-sysenter.S, which defines the symbol.  */
3408 +
3409 +       # sysenter call handler stub
3410 +ENTRY(sysenter_entry)
3411 +       CFI_STARTPROC simple
3412 +       CFI_SIGNAL_FRAME
3413 +       CFI_DEF_CFA esp, 0
3414 +       CFI_REGISTER esp, ebp
3415 +       movl SYSENTER_stack_esp0(%esp),%esp
3416 +sysenter_past_esp:
3417 +       /*
3418 +        * No need to follow this irqs on/off section: the syscall
3419 +        * disabled irqs and here we enable it straight after entry:
3420 +        */
3421 +       ENABLE_INTERRUPTS
3422 +       pushl $(__USER_DS)
3423 +       CFI_ADJUST_CFA_OFFSET 4
3424 +       /*CFI_REL_OFFSET ss, 0*/
3425 +       pushl %ebp
3426 +       CFI_ADJUST_CFA_OFFSET 4
3427 +       CFI_REL_OFFSET esp, 0
3428 +       pushfl
3429 +       CFI_ADJUST_CFA_OFFSET 4
3430 +       pushl $(__USER_CS)
3431 +       CFI_ADJUST_CFA_OFFSET 4
3432 +       /*CFI_REL_OFFSET cs, 0*/
3433 +       /*
3434 +        * Push current_thread_info()->sysenter_return to the stack.
3435 +        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
3436 +        * pushed above; +8 corresponds to copy_thread's esp0 setting.
3437 +        */
3438 +       pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
3439 +       CFI_ADJUST_CFA_OFFSET 4
3440 +       CFI_REL_OFFSET eip, 0
3441 +
3442 +/*
3443 + * Load the potential sixth argument from user stack.
3444 + * Careful about security.
3445 + */
3446 +       cmpl $__PAGE_OFFSET-3,%ebp
3447 +       jae syscall_fault
3448 +1:     movl (%ebp),%ebp
3449 +.section __ex_table,"a"
3450 +       .align 4
3451 +       .long 1b,syscall_fault
3452 +.previous
3453 +
3454 +       pushl %eax
3455 +       CFI_ADJUST_CFA_OFFSET 4
3456 +       SAVE_ALL
3457 +       GET_THREAD_INFO(%ebp)
3458 +
3459 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3460 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3461 +       jnz syscall_trace_entry
3462 +       cmpl $(nr_syscalls), %eax
3463 +       jae syscall_badsys
3464 +       call *sys_call_table(,%eax,4)
3465 +       movl %eax,EAX(%esp)
3466 +       DISABLE_INTERRUPTS
3467 +       TRACE_IRQS_OFF
3468 +       movl TI_flags(%ebp), %ecx
3469 +       testw $_TIF_ALLWORK_MASK, %cx
3470 +       jne syscall_exit_work
3471 +/* if something modifies registers it must also disable sysexit */
3472 +       movl EIP(%esp), %edx
3473 +       movl OLDESP(%esp), %ecx
3474 +       xorl %ebp,%ebp
3475 +       TRACE_IRQS_ON
3476 +#ifdef CONFIG_XEN
3477 +       __ENABLE_INTERRUPTS
3478 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
3479 +       __TEST_PENDING
3480 +       jnz  14f                        # process more events if necessary...
3481 +       movl ESI(%esp), %esi
3482 +       sysexit
3483 +14:    __DISABLE_INTERRUPTS
3484 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
3485 +       push %esp
3486 +       CFI_ADJUST_CFA_OFFSET 4
3487 +       call evtchn_do_upcall
3488 +       add  $4,%esp
3489 +       CFI_ADJUST_CFA_OFFSET -4
3490 +       jmp  ret_from_intr
3491 +#else
3492 +       ENABLE_INTERRUPTS_SYSEXIT
3493 +#endif /* !CONFIG_XEN */
3494 +       CFI_ENDPROC
3495 +
3496 +
3497 +       # system call handler stub
3498 +ENTRY(system_call)
3499 +       RING0_INT_FRAME                 # can't unwind into user space anyway
3500 +       pushl %eax                      # save orig_eax
3501 +       CFI_ADJUST_CFA_OFFSET 4
3502 +       SAVE_ALL
3503 +       GET_THREAD_INFO(%ebp)
3504 +       testl $TF_MASK,EFLAGS(%esp)
3505 +       jz no_singlestep
3506 +       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
3507 +no_singlestep:
3508 +                                       # system call tracing in operation / emulation
3509 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3510 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3511 +       jnz syscall_trace_entry
3512 +       cmpl $(nr_syscalls), %eax
3513 +       jae syscall_badsys
3514 +syscall_call:
3515 +       call *sys_call_table(,%eax,4)
3516 +       movl %eax,EAX(%esp)             # store the return value
3517 +syscall_exit:
3518 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3519 +                                       # setting need_resched or sigpending
3520 +                                       # between sampling and the iret
3521 +       TRACE_IRQS_OFF
3522 +       movl TI_flags(%ebp), %ecx
3523 +       testw $_TIF_ALLWORK_MASK, %cx   # current->work
3524 +       jne syscall_exit_work
3525 +
3526 +restore_all:
3527 +#ifndef CONFIG_XEN
3528 +       movl EFLAGS(%esp), %eax         # mix EFLAGS, SS and CS
3529 +       # Warning: OLDSS(%esp) contains the wrong/random values if we
3530 +       # are returning to the kernel.
3531 +       # See comments in process.c:copy_thread() for details.
3532 +       movb OLDSS(%esp), %ah
3533 +       movb CS(%esp), %al
3534 +       andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
3535 +       cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
3536 +       CFI_REMEMBER_STATE
3537 +       je ldt_ss                       # returning to user-space with LDT SS
3538 +restore_nocheck:
3539 +#else
3540 +restore_nocheck:
3541 +       movl EFLAGS(%esp), %eax
3542 +       testl $(VM_MASK|NMI_MASK), %eax
3543 +       jnz hypervisor_iret
3544 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
3545 +       GET_VCPU_INFO
3546 +       andb evtchn_upcall_mask(%esi),%al
3547 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
3548 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
3549 +       CFI_REMEMBER_STATE
3550 +#endif
3551 +       TRACE_IRQS_IRET
3552 +restore_nocheck_notrace:
3553 +       RESTORE_REGS
3554 +       addl $4, %esp
3555 +       CFI_ADJUST_CFA_OFFSET -4
3556 +1:     INTERRUPT_RETURN
3557 +.section .fixup,"ax"
3558 +iret_exc:
3559 +#ifndef CONFIG_XEN
3560 +       TRACE_IRQS_ON
3561 +       ENABLE_INTERRUPTS
3562 +#endif
3563 +       pushl $0                        # no error code
3564 +       pushl $do_iret_error
3565 +       jmp error_code
3566 +.previous
3567 +.section __ex_table,"a"
3568 +       .align 4
3569 +       .long 1b,iret_exc
3570 +.previous
3571 +
3572 +       CFI_RESTORE_STATE
3573 +#ifndef CONFIG_XEN
3574 +ldt_ss:
3575 +       larl OLDSS(%esp), %eax
3576 +       jnz restore_nocheck
3577 +       testl $0x00400000, %eax         # returning to 32bit stack?
3578 +       jnz restore_nocheck             # all right, normal return
3579 +       /* If returning to userspace with 16bit stack,
3580 +        * try to fix the higher word of ESP, as the CPU
3581 +        * won't restore it.
3582 +        * This is an "official" bug of all the x86-compatible
3583 +        * CPUs, which we can try to work around to make
3584 +        * dosemu and wine happy. */
3585 +       subl $8, %esp           # reserve space for switch16 pointer
3586 +       CFI_ADJUST_CFA_OFFSET 8
3587 +       DISABLE_INTERRUPTS
3588 +       TRACE_IRQS_OFF
3589 +       movl %esp, %eax
3590 +       /* Set up the 16bit stack frame with switch32 pointer on top,
3591 +        * and a switch16 pointer on top of the current frame. */
3592 +       call setup_x86_bogus_stack
3593 +       CFI_ADJUST_CFA_OFFSET -8        # frame has moved
3594 +       TRACE_IRQS_IRET
3595 +       RESTORE_REGS
3596 +       lss 20+4(%esp), %esp    # switch to 16bit stack
3597 +1:     INTERRUPT_RETURN
3598 +.section __ex_table,"a"
3599 +       .align 4
3600 +       .long 1b,iret_exc
3601 +.previous
3602 +#else
3603 +hypervisor_iret:
3604 +       andl $~NMI_MASK, EFLAGS(%esp)
3605 +       RESTORE_REGS
3606 +       addl $4, %esp
3607 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
3608 +#endif
3609 +       CFI_ENDPROC
3610 +
3611 +       # perform work that needs to be done immediately before resumption
3612 +       ALIGN
3613 +       RING0_PTREGS_FRAME              # can't unwind into user space anyway
3614 +work_pending:
3615 +       testb $_TIF_NEED_RESCHED, %cl
3616 +       jz work_notifysig
3617 +work_resched:
3618 +       call schedule
3619 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3620 +                                       # setting need_resched or sigpending
3621 +                                       # between sampling and the iret
3622 +       TRACE_IRQS_OFF
3623 +       movl TI_flags(%ebp), %ecx
3624 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
3625 +                                       # than syscall tracing?
3626 +       jz restore_all
3627 +       testb $_TIF_NEED_RESCHED, %cl
3628 +       jnz work_resched
3629 +
3630 +work_notifysig:                                # deal with pending signals and
3631 +                                       # notify-resume requests
3632 +       testl $VM_MASK, EFLAGS(%esp)
3633 +       movl %esp, %eax
3634 +       jne work_notifysig_v86          # returning to kernel-space or
3635 +                                       # vm86-space
3636 +       xorl %edx, %edx
3637 +       call do_notify_resume
3638 +       jmp resume_userspace_sig
3639 +
3640 +       ALIGN
3641 +work_notifysig_v86:
3642 +#ifdef CONFIG_VM86
3643 +       pushl %ecx                      # save ti_flags for do_notify_resume
3644 +       CFI_ADJUST_CFA_OFFSET 4
3645 +       call save_v86_state             # %eax contains pt_regs pointer
3646 +       popl %ecx
3647 +       CFI_ADJUST_CFA_OFFSET -4
3648 +       movl %eax, %esp
3649 +       xorl %edx, %edx
3650 +       call do_notify_resume
3651 +       jmp resume_userspace_sig
3652 +#endif
3653 +
3654 +       # perform syscall exit tracing
3655 +       ALIGN
3656 +syscall_trace_entry:
3657 +       movl $-ENOSYS,EAX(%esp)
3658 +       movl %esp, %eax
3659 +       xorl %edx,%edx
3660 +       call do_syscall_trace
3661 +       cmpl $0, %eax
3662 +       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
3663 +                                       # so must skip actual syscall
3664 +       movl ORIG_EAX(%esp), %eax
3665 +       cmpl $(nr_syscalls), %eax
3666 +       jnae syscall_call
3667 +       jmp syscall_exit
3668 +
3669 +       # perform syscall exit tracing
3670 +       ALIGN
3671 +syscall_exit_work:
3672 +       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
3673 +       jz work_pending
3674 +       TRACE_IRQS_ON
3675 +       ENABLE_INTERRUPTS               # could let do_syscall_trace() call
3676 +                                       # schedule() instead
3677 +       movl %esp, %eax
3678 +       movl $1, %edx
3679 +       call do_syscall_trace
3680 +       jmp resume_userspace
3681 +       CFI_ENDPROC
3682 +
3683 +       RING0_INT_FRAME                 # can't unwind into user space anyway
3684 +syscall_fault:
3685 +       pushl %eax                      # save orig_eax
3686 +       CFI_ADJUST_CFA_OFFSET 4
3687 +       SAVE_ALL
3688 +       GET_THREAD_INFO(%ebp)
3689 +       movl $-EFAULT,EAX(%esp)
3690 +       jmp resume_userspace
3691 +
3692 +syscall_badsys:
3693 +       movl $-ENOSYS,EAX(%esp)
3694 +       jmp resume_userspace
3695 +       CFI_ENDPROC
3696 +
3697 +#ifndef CONFIG_XEN
3698 +#define FIXUP_ESPFIX_STACK \
3699 +       movl %esp, %eax; \
3700 +       /* switch to 32bit stack using the pointer on top of 16bit stack */ \
3701 +       lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
3702 +       /* copy data from 16bit stack to 32bit stack */ \
3703 +       call fixup_x86_bogus_stack; \
3704 +       /* put ESP to the proper location */ \
3705 +       movl %eax, %esp;
3706 +#define UNWIND_ESPFIX_STACK \
3707 +       pushl %eax; \
3708 +       CFI_ADJUST_CFA_OFFSET 4; \
3709 +       movl %ss, %eax; \
3710 +       /* see if on 16bit stack */ \
3711 +       cmpw $__ESPFIX_SS, %ax; \
3712 +       je 28f; \
3713 +27:    popl %eax; \
3714 +       CFI_ADJUST_CFA_OFFSET -4; \
3715 +.section .fixup,"ax"; \
3716 +28:    movl $__KERNEL_DS, %eax; \
3717 +       movl %eax, %ds; \
3718 +       movl %eax, %es; \
3719 +       /* switch to 32bit stack */ \
3720 +       FIXUP_ESPFIX_STACK; \
3721 +       jmp 27b; \
3722 +.previous
3723 +
3724 +/*
3725 + * Build the entry stubs and pointer table with
3726 + * some assembler magic.
3727 + */
3728 +.data
3729 +ENTRY(interrupt)
3730 +.text
3731 +
3732 +vector=0
3733 +ENTRY(irq_entries_start)
3734 +       RING0_INT_FRAME
3735 +.rept NR_IRQS
3736 +       ALIGN
3737 + .if vector
3738 +       CFI_ADJUST_CFA_OFFSET -4
3739 + .endif
3740 +1:     pushl $~(vector)
3741 +       CFI_ADJUST_CFA_OFFSET 4
3742 +       jmp common_interrupt
3743 +.data
3744 +       .long 1b
3745 +.text
3746 +vector=vector+1
3747 +.endr
3748 +
3749 +/*
3750 + * the CPU automatically disables interrupts when executing an IRQ vector,
3751 + * so IRQ-flags tracing has to follow that:
3752 + */
3753 +       ALIGN
3754 +common_interrupt:
3755 +       SAVE_ALL
3756 +       TRACE_IRQS_OFF
3757 +       movl %esp,%eax
3758 +       call do_IRQ
3759 +       jmp ret_from_intr
3760 +       CFI_ENDPROC
3761 +
3762 +#define BUILD_INTERRUPT(name, nr)      \
3763 +ENTRY(name)                            \
3764 +       RING0_INT_FRAME;                \
3765 +       pushl $~(nr);                   \
3766 +       CFI_ADJUST_CFA_OFFSET 4;        \
3767 +       SAVE_ALL;                       \
3768 +       TRACE_IRQS_OFF                  \
3769 +       movl %esp,%eax;                 \
3770 +       call smp_/**/name;              \
3771 +       jmp ret_from_intr;              \
3772 +       CFI_ENDPROC
3773 +
3774 +/* The include is where all of the SMP etc. interrupts come from */
3775 +#include "entry_arch.h"
3776 +#else
3777 +#define UNWIND_ESPFIX_STACK
3778 +#endif
3779 +
3780 +KPROBE_ENTRY(page_fault)
3781 +       RING0_EC_FRAME
3782 +       pushl $do_page_fault
3783 +       CFI_ADJUST_CFA_OFFSET 4
3784 +       ALIGN
3785 +error_code:
3786 +       pushl %ds
3787 +       CFI_ADJUST_CFA_OFFSET 4
3788 +       /*CFI_REL_OFFSET ds, 0*/
3789 +       pushl %eax
3790 +       CFI_ADJUST_CFA_OFFSET 4
3791 +       CFI_REL_OFFSET eax, 0
3792 +       xorl %eax, %eax
3793 +       pushl %ebp
3794 +       CFI_ADJUST_CFA_OFFSET 4
3795 +       CFI_REL_OFFSET ebp, 0
3796 +       pushl %edi
3797 +       CFI_ADJUST_CFA_OFFSET 4
3798 +       CFI_REL_OFFSET edi, 0
3799 +       pushl %esi
3800 +       CFI_ADJUST_CFA_OFFSET 4
3801 +       CFI_REL_OFFSET esi, 0
3802 +       pushl %edx
3803 +       CFI_ADJUST_CFA_OFFSET 4
3804 +       CFI_REL_OFFSET edx, 0
3805 +       decl %eax                       # eax = -1
3806 +       pushl %ecx
3807 +       CFI_ADJUST_CFA_OFFSET 4
3808 +       CFI_REL_OFFSET ecx, 0
3809 +       pushl %ebx
3810 +       CFI_ADJUST_CFA_OFFSET 4
3811 +       CFI_REL_OFFSET ebx, 0
3812 +       cld
3813 +       pushl %es
3814 +       CFI_ADJUST_CFA_OFFSET 4
3815 +       /*CFI_REL_OFFSET es, 0*/
3816 +       UNWIND_ESPFIX_STACK
3817 +       popl %ecx
3818 +       CFI_ADJUST_CFA_OFFSET -4
3819 +       /*CFI_REGISTER es, ecx*/
3820 +       movl ES(%esp), %edi             # get the function address
3821 +       movl ORIG_EAX(%esp), %edx       # get the error code
3822 +       movl %eax, ORIG_EAX(%esp)
3823 +       movl %ecx, ES(%esp)
3824 +       /*CFI_REL_OFFSET es, ES*/
3825 +       movl $(__USER_DS), %ecx
3826 +       movl %ecx, %ds
3827 +       movl %ecx, %es
3828 +       movl %esp,%eax                  # pt_regs pointer
3829 +       call *%edi
3830 +       jmp ret_from_exception
3831 +       CFI_ENDPROC
3832 +KPROBE_END(page_fault)
3833 +
3834 +#ifdef CONFIG_XEN
3835 +# A note on the "critical region" in our callback handler.
3836 +# We want to avoid stacking callback handlers due to events occurring
3837 +# during handling of the last event. To do this, we keep events disabled
3838 +# until we've done all processing. HOWEVER, we must enable events before
3839 +# popping the stack frame (can't be done atomically) and so it would still
3840 +# be possible to get enough handler activations to overflow the stack.
3841 +# Although unlikely, bugs of that kind are hard to track down, so we'd
3842 +# like to avoid the possibility.
3843 +# So, on entry to the handler we detect whether we interrupted an
3844 +# existing activation in its critical region -- if so, we pop the current
3845 +# activation and restart the handler using the previous one.
3846 +#
3847 +# The sysexit critical region is slightly different. sysexit
3848 +# atomically removes the entire stack frame. If we interrupt in the
3849 +# critical region we know that the entire frame is present and correct
3850 +# so we can simply throw away the new one.
3851 +ENTRY(hypervisor_callback)
3852 +       RING0_INT_FRAME
3853 +       pushl %eax
3854 +       CFI_ADJUST_CFA_OFFSET 4
3855 +       SAVE_ALL
3856 +       movl EIP(%esp),%eax
3857 +       cmpl $scrit,%eax
3858 +       jb   11f
3859 +       cmpl $ecrit,%eax
3860 +       jb   critical_region_fixup
3861 +       cmpl $sysexit_scrit,%eax
3862 +       jb   11f
3863 +       cmpl $sysexit_ecrit,%eax
3864 +       ja   11f
3865 +       # interrupted in sysexit critical
3866 +       addl $0x34,%esp                 # Remove cs...ebx from stack frame.
3867 +       # this popped off the new frame to reuse the old one, therefore no
3868 +       # CFI_ADJUST_CFA_OFFSET here
3869 +11:    push %esp
3870 +       CFI_ADJUST_CFA_OFFSET 4
3871 +       call evtchn_do_upcall
3872 +       add  $4,%esp
3873 +       CFI_ADJUST_CFA_OFFSET -4
3874 +       jmp  ret_from_intr
3875 +
3876 +        ALIGN
3877 +restore_all_enable_events:
3878 +       __ENABLE_INTERRUPTS
3879 +scrit: /**** START OF CRITICAL REGION ****/
3880 +       __TEST_PENDING
3881 +       jnz  14f                        # process more events if necessary...
3882 +       RESTORE_REGS
3883 +       addl $4, %esp
3884 +       CFI_ADJUST_CFA_OFFSET -4
3885 +1:     iret
3886 +.section __ex_table,"a"
3887 +       .align 4
3888 +       .long 1b,iret_exc
3889 +.previous
3890 +14:    __DISABLE_INTERRUPTS
3891 +       jmp  11b
3892 +ecrit:  /**** END OF CRITICAL REGION ****/
3893 +# [How we do the fixup]. We want to merge the current stack frame with the
3894 +# just-interrupted frame. How we do this depends on where in the critical
3895 +# region the interrupted handler was executing, and so how many saved
3896 +# registers are in each frame. We do this quickly using the lookup table
3897 +# 'critical_fixup_table'. For each byte offset in the critical region, it
3898 +# provides the number of bytes which have already been popped from the
3899 +# interrupted stack frame.
3900 +critical_region_fixup:
3901 +       addl $critical_fixup_table-scrit,%eax
3902 +       movzbl (%eax),%eax              # %eax contains num bytes popped
3903 +       cmpb $0xff,%al                  # 0xff => vcpu_info critical region
3904 +       jne  15f
3905 +       GET_THREAD_INFO(%ebp)
3906 +        xorl %eax,%eax
3907 +15:    mov  %esp,%esi
3908 +       add  %eax,%esi                  # %esi points at end of src region
3909 +       mov  %esp,%edi
3910 +       add  $0x34,%edi                 # %edi points at end of dst region
3911 +       mov  %eax,%ecx
3912 +       shr  $2,%ecx                    # convert bytes to words
3913 +       je   17f                        # skip loop if nothing to copy
3914 +16:    subl $4,%esi                    # pre-decrementing copy loop
3915 +       subl $4,%edi
3916 +       movl (%esi),%eax
3917 +       movl %eax,(%edi)
3918 +       loop 16b
3919 +17:    movl %edi,%esp                  # final %edi is top of merged stack
3920 +       # this popped off the new frame to reuse the old one, therefore no
3921 +       # CFI_DEF_CFA_OFFSET here
3922 +       jmp  11b
3923 +       CFI_ENDPROC
3924 +
3925 +critical_fixup_table:
3926 +       .byte 0xff,0xff,0xff            # testb $0xff,(%esi) = __TEST_PENDING
3927 +       .byte 0xff,0xff                 # jnz  14f
3928 +       .byte 0x00                      # pop  %ebx
3929 +       .byte 0x04                      # pop  %ecx
3930 +       .byte 0x08                      # pop  %edx
3931 +       .byte 0x0c                      # pop  %esi
3932 +       .byte 0x10                      # pop  %edi
3933 +       .byte 0x14                      # pop  %ebp
3934 +       .byte 0x18                      # pop  %eax
3935 +       .byte 0x1c                      # pop  %ds
3936 +       .byte 0x20                      # pop  %es
3937 +       .byte 0x24,0x24,0x24            # add  $4,%esp
3938 +       .byte 0x28                      # iret
3939 +       .byte 0xff,0xff,0xff,0xff       # movb $1,1(%esi)
3940 +       .byte 0x00,0x00                 # jmp  11b
3941 +
3942 +# Hypervisor uses this for application faults while it executes.
3943 +# We get here for two reasons:
3944 +#  1. Fault while reloading DS, ES, FS or GS
3945 +#  2. Fault while executing IRET
3946 +# Category 1 we fix up by reattempting the load, and zeroing the segment
3947 +# register if the load fails.
3948 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3949 +# normal Linux return path in this case because if we use the IRET hypercall
3950 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3951 +# We distinguish between categories by maintaining a status value in EAX.
3952 +ENTRY(failsafe_callback)
3953 +       RING0_INT_FRAME
3954 +       pushl %eax
3955 +       CFI_ADJUST_CFA_OFFSET 4
3956 +       movl $1,%eax
3957 +1:     mov 4(%esp),%ds
3958 +2:     mov 8(%esp),%es
3959 +3:     mov 12(%esp),%fs
3960 +4:     mov 16(%esp),%gs
3961 +       testl %eax,%eax
3962 +       popl %eax
3963 +       CFI_ADJUST_CFA_OFFSET -4
3964 +       jz 5f
3965 +       addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
3966 +       CFI_ADJUST_CFA_OFFSET -16
3967 +       jmp iret_exc
3968 +       CFI_ADJUST_CFA_OFFSET 16
3969 +5:     addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
3970 +       CFI_ADJUST_CFA_OFFSET -16
3971 +       pushl $0
3972 +       CFI_ADJUST_CFA_OFFSET 4
3973 +       SAVE_ALL
3974 +       jmp ret_from_exception
3975 +.section .fixup,"ax";          \
3976 +6:     xorl %eax,%eax;         \
3977 +       movl %eax,4(%esp);      \
3978 +       jmp 1b;                 \
3979 +7:     xorl %eax,%eax;         \
3980 +       movl %eax,8(%esp);      \
3981 +       jmp 2b;                 \
3982 +8:     xorl %eax,%eax;         \
3983 +       movl %eax,12(%esp);     \
3984 +       jmp 3b;                 \
3985 +9:     xorl %eax,%eax;         \
3986 +       movl %eax,16(%esp);     \
3987 +       jmp 4b;                 \
3988 +.previous;                     \
3989 +.section __ex_table,"a";       \
3990 +       .align 4;               \
3991 +       .long 1b,6b;            \
3992 +       .long 2b,7b;            \
3993 +       .long 3b,8b;            \
3994 +       .long 4b,9b;            \
3995 +.previous
3996 +       CFI_ENDPROC
3997 +#endif
3998 +
3999 +ENTRY(coprocessor_error)
4000 +       RING0_INT_FRAME
4001 +       pushl $0
4002 +       CFI_ADJUST_CFA_OFFSET 4
4003 +       pushl $do_coprocessor_error
4004 +       CFI_ADJUST_CFA_OFFSET 4
4005 +       jmp error_code
4006 +       CFI_ENDPROC
4007 +
4008 +ENTRY(simd_coprocessor_error)
4009 +       RING0_INT_FRAME
4010 +       pushl $0
4011 +       CFI_ADJUST_CFA_OFFSET 4
4012 +       pushl $do_simd_coprocessor_error
4013 +       CFI_ADJUST_CFA_OFFSET 4
4014 +       jmp error_code
4015 +       CFI_ENDPROC
4016 +
4017 +ENTRY(device_not_available)
4018 +       RING0_INT_FRAME
4019 +       pushl $-1                       # mark this as an int
4020 +       CFI_ADJUST_CFA_OFFSET 4
4021 +       SAVE_ALL
4022 +#ifndef CONFIG_XEN
4023 +       GET_CR0_INTO_EAX
4024 +       testl $0x4, %eax                # EM (math emulation bit)
4025 +       je device_available_emulate
4026 +       pushl $0                        # temporary storage for ORIG_EIP
4027 +       CFI_ADJUST_CFA_OFFSET 4
4028 +       call math_emulate
4029 +       addl $4, %esp
4030 +       CFI_ADJUST_CFA_OFFSET -4
4031 +       jmp ret_from_exception
4032 +device_available_emulate:
4033 +#endif
4034 +       preempt_stop
4035 +       call math_state_restore
4036 +       jmp ret_from_exception
4037 +       CFI_ENDPROC
4038 +
4039 +#ifndef CONFIG_XEN
4040 +/*
4041 + * Debug traps and NMI can happen at the one SYSENTER instruction
4042 + * that sets up the real kernel stack. Check here, since we can't
4043 + * allow the wrong stack to be used.
4044 + *
4045 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
4046 + * already pushed 3 words if it hits on the sysenter instruction:
4047 + * eflags, cs and eip.
4048 + *
4049 + * We just load the right stack, and push the three (known) values
4050 + * by hand onto the new stack - while updating the return eip past
4051 + * the instruction that would have done it for sysenter.
4052 + */
4053 +#define FIX_STACK(offset, ok, label)           \
4054 +       cmpw $__KERNEL_CS,4(%esp);              \
4055 +       jne ok;                                 \
4056 +label:                                         \
4057 +       movl SYSENTER_stack_esp0+offset(%esp),%esp;     \
4058 +       CFI_DEF_CFA esp, 0;                     \
4059 +       CFI_UNDEFINED eip;                      \
4060 +       pushfl;                                 \
4061 +       CFI_ADJUST_CFA_OFFSET 4;                \
4062 +       pushl $__KERNEL_CS;                     \
4063 +       CFI_ADJUST_CFA_OFFSET 4;                \
4064 +       pushl $sysenter_past_esp;               \
4065 +       CFI_ADJUST_CFA_OFFSET 4;                \
4066 +       CFI_REL_OFFSET eip, 0
4067 +#endif /* CONFIG_XEN */
4068 +
4069 +KPROBE_ENTRY(debug)
4070 +       RING0_INT_FRAME
4071 +#ifndef CONFIG_XEN
4072 +       cmpl $sysenter_entry,(%esp)
4073 +       jne debug_stack_correct
4074 +       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
4075 +debug_stack_correct:
4076 +#endif /* !CONFIG_XEN */
4077 +       pushl $-1                       # mark this as an int
4078 +       CFI_ADJUST_CFA_OFFSET 4
4079 +       SAVE_ALL
4080 +       xorl %edx,%edx                  # error code 0
4081 +       movl %esp,%eax                  # pt_regs pointer
4082 +       call do_debug
4083 +       jmp ret_from_exception
4084 +       CFI_ENDPROC
4085 +KPROBE_END(debug)
4086 +
4087 +#ifndef CONFIG_XEN
4088 +/*
4089 + * NMI is doubly nasty. It can happen _while_ we're handling
4090 + * a debug fault, and the debug fault hasn't yet been able to
4091 + * clear up the stack. So we first check whether we got an
4092 + * NMI on the sysenter entry path, but after that we need to
4093 + * check whether we got an NMI on the debug path where the debug
4094 + * fault happened on the sysenter path.
4095 + */
4096 +KPROBE_ENTRY(nmi)
4097 +       RING0_INT_FRAME
4098 +       pushl %eax
4099 +       CFI_ADJUST_CFA_OFFSET 4
4100 +       movl %ss, %eax
4101 +       cmpw $__ESPFIX_SS, %ax
4102 +       popl %eax
4103 +       CFI_ADJUST_CFA_OFFSET -4
4104 +       je nmi_16bit_stack
4105 +       cmpl $sysenter_entry,(%esp)
4106 +       je nmi_stack_fixup
4107 +       pushl %eax
4108 +       CFI_ADJUST_CFA_OFFSET 4
4109 +       movl %esp,%eax
4110 +       /* Do not access memory above the end of our stack page,
4111 +        * it might not exist.
4112 +        */
4113 +       andl $(THREAD_SIZE-1),%eax
4114 +       cmpl $(THREAD_SIZE-20),%eax
4115 +       popl %eax
4116 +       CFI_ADJUST_CFA_OFFSET -4
4117 +       jae nmi_stack_correct
4118 +       cmpl $sysenter_entry,12(%esp)
4119 +       je nmi_debug_stack_check
4120 +nmi_stack_correct:
4121 +       /* We have a RING0_INT_FRAME here */
4122 +       pushl %eax
4123 +       CFI_ADJUST_CFA_OFFSET 4
4124 +       SAVE_ALL
4125 +       xorl %edx,%edx          # zero error code
4126 +       movl %esp,%eax          # pt_regs pointer
4127 +       call do_nmi
4128 +       jmp restore_nocheck_notrace
4129 +       CFI_ENDPROC
4130 +
4131 +nmi_stack_fixup:
4132 +       RING0_INT_FRAME
4133 +       FIX_STACK(12,nmi_stack_correct, 1)
4134 +       jmp nmi_stack_correct
4135 +
4136 +nmi_debug_stack_check:
4137 +       /* We have a RING0_INT_FRAME here */
4138 +       cmpw $__KERNEL_CS,16(%esp)
4139 +       jne nmi_stack_correct
4140 +       cmpl $debug,(%esp)
4141 +       jb nmi_stack_correct
4142 +       cmpl $debug_esp_fix_insn,(%esp)
4143 +       ja nmi_stack_correct
4144 +       FIX_STACK(24,nmi_stack_correct, 1)
4145 +       jmp nmi_stack_correct
4146 +
4147 +nmi_16bit_stack:
4148 +       /* We have a RING0_INT_FRAME here.
4149 +        *
4150 +        * create the pointer to lss back
4151 +        */
4152 +       pushl %ss
4153 +       CFI_ADJUST_CFA_OFFSET 4
4154 +       pushl %esp
4155 +       CFI_ADJUST_CFA_OFFSET 4
4156 +       movzwl %sp, %esp
4157 +       addw $4, (%esp)
4158 +       /* copy the iret frame of 12 bytes */
4159 +       .rept 3
4160 +       pushl 16(%esp)
4161 +       CFI_ADJUST_CFA_OFFSET 4
4162 +       .endr
4163 +       pushl %eax
4164 +       CFI_ADJUST_CFA_OFFSET 4
4165 +       SAVE_ALL
4166 +       FIXUP_ESPFIX_STACK              # %eax == %esp
4167 +       CFI_ADJUST_CFA_OFFSET -20       # the frame has now moved
4168 +       xorl %edx,%edx                  # zero error code
4169 +       call do_nmi
4170 +       RESTORE_REGS
4171 +       lss 12+4(%esp), %esp            # back to 16bit stack
4172 +1:     INTERRUPT_RETURN
4173 +       CFI_ENDPROC
4174 +.section __ex_table,"a"
4175 +       .align 4
4176 +       .long 1b,iret_exc
4177 +.previous
4178 +KPROBE_END(nmi)
4179 +#else
4180 +KPROBE_ENTRY(nmi)
4181 +       RING0_INT_FRAME
4182 +       pushl %eax
4183 +       CFI_ADJUST_CFA_OFFSET 4
4184 +       SAVE_ALL
4185 +       xorl %edx,%edx          # zero error code
4186 +       movl %esp,%eax          # pt_regs pointer
4187 +       call do_nmi
4188 +       orl  $NMI_MASK, EFLAGS(%esp)
4189 +       jmp restore_all
4190 +       CFI_ENDPROC
4191 +KPROBE_END(nmi)
4192 +#endif
4193 +
4194 +KPROBE_ENTRY(int3)
4195 +       RING0_INT_FRAME
4196 +       pushl $-1                       # mark this as an int
4197 +       CFI_ADJUST_CFA_OFFSET 4
4198 +       SAVE_ALL
4199 +       xorl %edx,%edx          # zero error code
4200 +       movl %esp,%eax          # pt_regs pointer
4201 +       call do_int3
4202 +       jmp ret_from_exception
4203 +       CFI_ENDPROC
4204 +KPROBE_END(int3)
4205 +
4206 +ENTRY(overflow)
4207 +       RING0_INT_FRAME
4208 +       pushl $0
4209 +       CFI_ADJUST_CFA_OFFSET 4
4210 +       pushl $do_overflow
4211 +       CFI_ADJUST_CFA_OFFSET 4
4212 +       jmp error_code
4213 +       CFI_ENDPROC
4214 +
4215 +ENTRY(bounds)
4216 +       RING0_INT_FRAME
4217 +       pushl $0
4218 +       CFI_ADJUST_CFA_OFFSET 4
4219 +       pushl $do_bounds
4220 +       CFI_ADJUST_CFA_OFFSET 4
4221 +       jmp error_code
4222 +       CFI_ENDPROC
4223 +
4224 +ENTRY(invalid_op)
4225 +       RING0_INT_FRAME
4226 +       pushl $0
4227 +       CFI_ADJUST_CFA_OFFSET 4
4228 +       pushl $do_invalid_op
4229 +       CFI_ADJUST_CFA_OFFSET 4
4230 +       jmp error_code
4231 +       CFI_ENDPROC
4232 +
4233 +ENTRY(coprocessor_segment_overrun)
4234 +       RING0_INT_FRAME
4235 +       pushl $0
4236 +       CFI_ADJUST_CFA_OFFSET 4
4237 +       pushl $do_coprocessor_segment_overrun
4238 +       CFI_ADJUST_CFA_OFFSET 4
4239 +       jmp error_code
4240 +       CFI_ENDPROC
4241 +
4242 +ENTRY(invalid_TSS)
4243 +       RING0_EC_FRAME
4244 +       pushl $do_invalid_TSS
4245 +       CFI_ADJUST_CFA_OFFSET 4
4246 +       jmp error_code
4247 +       CFI_ENDPROC
4248 +
4249 +ENTRY(segment_not_present)
4250 +       RING0_EC_FRAME
4251 +       pushl $do_segment_not_present
4252 +       CFI_ADJUST_CFA_OFFSET 4
4253 +       jmp error_code
4254 +       CFI_ENDPROC
4255 +
4256 +ENTRY(stack_segment)
4257 +       RING0_EC_FRAME
4258 +       pushl $do_stack_segment
4259 +       CFI_ADJUST_CFA_OFFSET 4
4260 +       jmp error_code
4261 +       CFI_ENDPROC
4262 +
4263 +KPROBE_ENTRY(general_protection)
4264 +       RING0_EC_FRAME
4265 +       pushl $do_general_protection
4266 +       CFI_ADJUST_CFA_OFFSET 4
4267 +       jmp error_code
4268 +       CFI_ENDPROC
4269 +KPROBE_END(general_protection)
4270 +
4271 +ENTRY(alignment_check)
4272 +       RING0_EC_FRAME
4273 +       pushl $do_alignment_check
4274 +       CFI_ADJUST_CFA_OFFSET 4
4275 +       jmp error_code
4276 +       CFI_ENDPROC
4277 +
4278 +ENTRY(divide_error)
4279 +       RING0_INT_FRAME
4280 +       pushl $0                        # no error code
4281 +       CFI_ADJUST_CFA_OFFSET 4
4282 +       pushl $do_divide_error
4283 +       CFI_ADJUST_CFA_OFFSET 4
4284 +       jmp error_code
4285 +       CFI_ENDPROC
4286 +
4287 +#ifdef CONFIG_X86_MCE
4288 +ENTRY(machine_check)
4289 +       RING0_INT_FRAME
4290 +       pushl $0
4291 +       CFI_ADJUST_CFA_OFFSET 4
4292 +       pushl machine_check_vector
4293 +       CFI_ADJUST_CFA_OFFSET 4
4294 +       jmp error_code
4295 +       CFI_ENDPROC
4296 +#endif
4297 +
4298 +ENTRY(fixup_4gb_segment)
4299 +       RING0_INT_FRAME
4300 +       pushl $do_fixup_4gb_segment
4301 +       CFI_ADJUST_CFA_OFFSET 4
4302 +       jmp error_code
4303 +       CFI_ENDPROC
4304 +
4305 +#ifdef CONFIG_STACK_UNWIND
4306 +ENTRY(arch_unwind_init_running)
4307 +       CFI_STARTPROC
4308 +       movl    4(%esp), %edx
4309 +       movl    (%esp), %ecx
4310 +       leal    4(%esp), %eax
4311 +       movl    %ebx, EBX(%edx)
4312 +       xorl    %ebx, %ebx
4313 +       movl    %ebx, ECX(%edx)
4314 +       movl    %ebx, EDX(%edx)
4315 +       movl    %esi, ESI(%edx)
4316 +       movl    %edi, EDI(%edx)
4317 +       movl    %ebp, EBP(%edx)
4318 +       movl    %ebx, EAX(%edx)
4319 +       movl    $__USER_DS, DS(%edx)
4320 +       movl    $__USER_DS, ES(%edx)
4321 +       movl    %ebx, ORIG_EAX(%edx)
4322 +       movl    %ecx, EIP(%edx)
4323 +       movl    12(%esp), %ecx
4324 +       movl    $__KERNEL_CS, CS(%edx)
4325 +       movl    %ebx, EFLAGS(%edx)
4326 +       movl    %eax, OLDESP(%edx)
4327 +       movl    8(%esp), %eax
4328 +       movl    %ecx, 8(%esp)
4329 +       movl    EBX(%edx), %ebx
4330 +       movl    $__KERNEL_DS, OLDSS(%edx)
4331 +       jmpl    *%eax
4332 +       CFI_ENDPROC
4333 +ENDPROC(arch_unwind_init_running)
4334 +#endif
4335 +
4336 +ENTRY(kernel_thread_helper)
4337 +       pushl $0                # fake return address for unwinder
4338 +       CFI_STARTPROC
4339 +       movl %edx,%eax
4340 +       push %edx
4341 +       CFI_ADJUST_CFA_OFFSET 4
4342 +       call *%ebx
4343 +       push %eax
4344 +       CFI_ADJUST_CFA_OFFSET 4
4345 +       call do_exit
4346 +       CFI_ENDPROC
4347 +ENDPROC(kernel_thread_helper)
4348 +
4349 +.section .rodata,"a"
4350 +#include "syscall_table.S"
4351 +
4352 +syscall_table_size=(.-sys_call_table)
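The GET_VCPU_INFO/__DISABLE_INTERRUPTS/__ENABLE_INTERRUPTS/__TEST_PENDING macros at the top of this file replace cli/sti with byte writes to the per-VCPU fields of the shared info page, and restore_all_enable_events re-checks for pending events after unmasking. A rough C rendering of that logic, assuming the vcpu_info layout from xen/interface/xen.h (the xen_irq_* helper names are illustrative, not taken from this patch):

    #include <linux/compiler.h>        /* barrier()        */
    #include <xen/interface/xen.h>     /* struct vcpu_info */

    /* vcpu is this CPU's slot in HYPERVISOR_shared_info->vcpu_info[]. */
    static inline void xen_irq_disable(struct vcpu_info *vcpu)
    {
            vcpu->evtchn_upcall_mask = 1;    /* __DISABLE_INTERRUPTS */
    }

    static inline int xen_irq_enable(struct vcpu_info *vcpu)
    {
            vcpu->evtchn_upcall_mask = 0;    /* __ENABLE_INTERRUPTS  */
            barrier();
            /* __TEST_PENDING: the assembly restore path branches to
             * evtchn_do_upcall when this is non-zero. */
            return vcpu->evtchn_upcall_pending;
    }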
4353 diff -ruNp linux-2.6.19/arch/i386/kernel/entry.S linux-2.6.19-xen-3.0.4/arch/i386/kernel/entry.S
4354 --- linux-2.6.19/arch/i386/kernel/entry.S       2006-11-29 21:57:37.000000000 +0000
4355 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/entry.S     2007-02-02 19:10:21.000000000 +0000
4356 @@ -281,7 +281,7 @@ ENTRY(sysenter_entry)
4357         CFI_SIGNAL_FRAME
4358         CFI_DEF_CFA esp, 0
4359         CFI_REGISTER esp, ebp
4360 -       movl TSS_sysenter_esp0(%esp),%esp
4361 +       movl SYSENTER_stack_esp0(%esp),%esp
4362  sysenter_past_esp:
4363         /*
4364          * No need to follow this irqs on/off section: the syscall
4365 @@ -699,7 +699,7 @@ device_not_available_emulate:
4366   * that sets up the real kernel stack. Check here, since we can't
4367   * allow the wrong stack to be used.
4368   *
4369 - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
4370 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
4371   * already pushed 3 words if it hits on the sysenter instruction:
4372   * eflags, cs and eip.
4373   *
4374 @@ -711,7 +711,7 @@ device_not_available_emulate:
4375         cmpw $__KERNEL_CS,4(%esp);              \
4376         jne ok;                                 \
4377  label:                                         \
4378 -       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
4379 +       movl SYSENTER_stack_esp0+offset(%esp),%esp;     \
4380         CFI_DEF_CFA esp, 0;                     \
4381         CFI_UNDEFINED eip;                      \
4382         pushfl;                                 \
4383 diff -ruNp linux-2.6.19/arch/i386/kernel/fixup.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/fixup.c
4384 --- linux-2.6.19/arch/i386/kernel/fixup.c       1970-01-01 00:00:00.000000000 +0000
4385 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/fixup.c     2007-02-02 19:10:21.000000000 +0000
4386 @@ -0,0 +1,95 @@
4387 +/******************************************************************************
4388 + * fixup.c
4389 + * 
4390 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
4391 + * Used to avoid repeated slow emulation of common instructions used by the
4392 + * user-space TLS (Thread-Local Storage) libraries.
4393 + * 
4394 + * **** NOTE ****
4395 + *  Issues with the binary rewriting have caused it to be removed. Instead
4396 + *  we rely on Xen's emulator to boot the kernel, and then print a banner
4397 + *  message recommending that the user disable /lib/tls.
4398 + * 
4399 + * Copyright (c) 2004, K A Fraser
4400 + * 
4401 + * This program is free software; you can redistribute it and/or modify
4402 + * it under the terms of the GNU General Public License as published by
4403 + * the Free Software Foundation; either version 2 of the License, or
4404 + * (at your option) any later version.
4405 + * 
4406 + * This program is distributed in the hope that it will be useful,
4407 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
4408 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
4409 + * GNU General Public License for more details.
4410 + * 
4411 + * You should have received a copy of the GNU General Public License
4412 + * along with this program; if not, write to the Free Software
4413 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
4414 + */
4415 +
4416 +#include <linux/init.h>
4417 +#include <linux/sched.h>
4418 +#include <linux/slab.h>
4419 +#include <linux/kernel.h>
4420 +#include <linux/delay.h>
4421 +#include <linux/version.h>
4422 +
4423 +#define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )
4424 +
4425 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
4426 +{
4427 +#if 0
4428 +       static unsigned long printed = 0;
4429 +       char info[100];
4430 +       int i;
4431 +
4432 +       /* Ignore statically-linked init. */
4433 +       if (current->tgid == 1)
4434 +               return;
4435 +            
4436 +       HYPERVISOR_vm_assist(
4437 +               VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
4438 +
4439 +       if (test_and_set_bit(0, &printed))
4440 +               return;
4441 +
4442 +       sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
4443 +
4444 +       DP("");
4445 +       DP("***************************************************************");
4446 +       DP("***************************************************************");
4447 +       DP("** WARNING: Currently emulating unsupported memory accesses  **");
4448 +       DP("**          in /lib/tls glibc libraries. The emulation is    **");
4449 +       DP("**          slow. To ensure full performance you should      **");
4450 +       DP("**          install a 'xen-friendly' (nosegneg) version of   **");
4451 +       DP("**          the library, or disable tls support by executing **");
4452 +       DP("**          the following as root:                           **");
4453 +       DP("**          mv /lib/tls /lib/tls.disabled                    **");
4454 +       DP("** Offending process: %-38.38s **", info);
4455 +       DP("***************************************************************");
4456 +       DP("***************************************************************");
4457 +       DP("");
4458 +
4459 +       for (i = 5; i > 0; i--) {
4460 +               touch_softlockup_watchdog();
4461 +               printk("Pausing... %d", i);
4462 +               mdelay(1000);
4463 +               printk("\b\b\b\b\b\b\b\b\b\b\b\b");
4464 +       }
4465 +
4466 +       printk("Continuing...\n\n");
4467 +#else
4468 +       if (printk_ratelimit())
4469 +               printk(KERN_WARNING
4470 +                      "4gb seg fixup, process %s (pid %d), cs:ip %02x:%08lx\n",
4471 +                      current->comm, current->tgid, regs->xcs, regs->eip);
4472 +#endif
4473 +}
4474 +
4475 +static int __init fixup_init(void)
4476 +{
4477 +       HYPERVISOR_vm_assist(
4478 +               VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
4479 +       return 0;
4480 +}
4481 +__initcall(fixup_init);
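With the binary rewriting removed, the runtime effect of this file is just the boot-time vm_assist enable plus the rate-limited warning above. In a guest that still loads negative-offset TLS libraries, the log would contain lines of the following form (process name, pid and cs:ip values are made up here purely to show the format):

    4gb seg fixup, process ld-linux.so.2 (pid 2381), cs:ip 73:b7f2a416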
4482 diff -ruNp linux-2.6.19/arch/i386/kernel/head-xen.S linux-2.6.19-xen-3.0.4/arch/i386/kernel/head-xen.S
4483 --- linux-2.6.19/arch/i386/kernel/head-xen.S    1970-01-01 00:00:00.000000000 +0000
4484 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/head-xen.S  2007-02-02 19:10:21.000000000 +0000
4485 @@ -0,0 +1,201 @@
4486 +
4487 +
4488 +.text
4489 +#include <linux/elfnote.h>
4490 +#include <linux/threads.h>
4491 +#include <linux/linkage.h>
4492 +#include <asm/segment.h>
4493 +#include <asm/page.h>
4494 +#include <asm/thread_info.h>
4495 +#include <asm/asm-offsets.h>
4496 +#include <xen/interface/xen.h>
4497 +#include <xen/interface/elfnote.h>
4498 +
4499 +/*
4500 + * References to members of the new_cpu_data structure.
4501 + */
4502 +
4503 +#define X86            new_cpu_data+CPUINFO_x86
4504 +#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
4505 +#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
4506 +#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
4507 +#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
4508 +#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
4509 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
4510 +#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
4511 +
4512 +#define VIRT_ENTRY_OFFSET 0x0
4513 +.org VIRT_ENTRY_OFFSET
4514 +ENTRY(startup_32)
4515 +       movl %esi,xen_start_info
4516 +       cld
4517 +
4518 +       /* Set up the stack pointer */
4519 +       movl $(init_thread_union+THREAD_SIZE),%esp
4520 +
4521 +       /* get vendor info */
4522 +       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
4523 +       XEN_CPUID
4524 +       movl %eax,X86_CPUID             # save CPUID level
4525 +       movl %ebx,X86_VENDOR_ID         # lo 4 chars
4526 +       movl %edx,X86_VENDOR_ID+4       # next 4 chars
4527 +       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
4528 +
4529 +       movl $1,%eax            # Use the CPUID instruction to get CPU type
4530 +       XEN_CPUID
4531 +       movb %al,%cl            # save reg for future use
4532 +       andb $0x0f,%ah          # mask processor family
4533 +       movb %ah,X86
4534 +       andb $0xf0,%al          # mask model
4535 +       shrb $4,%al
4536 +       movb %al,X86_MODEL
4537 +       andb $0x0f,%cl          # mask mask revision
4538 +       movb %cl,X86_MASK
4539 +       movl %edx,X86_CAPABILITY
4540 +
4541 +       movb $1,X86_HARD_MATH
4542 +
4543 +       xorl %eax,%eax                  # Clear FS/GS and LDT
4544 +       movl %eax,%fs
4545 +       movl %eax,%gs
4546 +       cld                     # gcc2 wants the direction flag cleared at all times
4547 +
4548 +       call start_kernel
4549 +L6:
4550 +       jmp L6                  # main should never return here, but
4551 +                               # just in case, we know what happens.
4552 +
4553 +#define HYPERCALL_PAGE_OFFSET 0x1000
4554 +.org HYPERCALL_PAGE_OFFSET
4555 +ENTRY(hypercall_page)
4556 +.skip 0x1000
4557 +
4558 +/*
4559 + * Real beginning of normal "text" segment
4560 + */
4561 +ENTRY(stext)
4562 +ENTRY(_stext)
4563 +
4564 +/*
4565 + * BSS section
4566 + */
4567 +.section ".bss.page_aligned","w"
4568 +ENTRY(empty_zero_page)
4569 +       .fill 4096,1,0
4570 +
4571 +/*
4572 + * This starts the data section.
4573 + */
4574 +.data
4575 +
4576 +/*
4577 + * The Global Descriptor Table contains 28 quadwords, per-CPU.
4578 + */
4579 +ENTRY(cpu_gdt_table)
4580 +       .quad 0x0000000000000000        /* NULL descriptor */
4581 +       .quad 0x0000000000000000        /* 0x0b reserved */
4582 +       .quad 0x0000000000000000        /* 0x13 reserved */
4583 +       .quad 0x0000000000000000        /* 0x1b reserved */
4584 +       .quad 0x0000000000000000        /* 0x20 unused */
4585 +       .quad 0x0000000000000000        /* 0x28 unused */
4586 +       .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
4587 +       .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
4588 +       .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
4589 +       .quad 0x0000000000000000        /* 0x4b reserved */
4590 +       .quad 0x0000000000000000        /* 0x53 reserved */
4591 +       .quad 0x0000000000000000        /* 0x5b reserved */
4592 +
4593 +       .quad 0x00cf9a000000ffff        /* 0x60 kernel 4GB code at 0x00000000 */
4594 +       .quad 0x00cf92000000ffff        /* 0x68 kernel 4GB data at 0x00000000 */
4595 +       .quad 0x00cffa000000ffff        /* 0x73 user 4GB code at 0x00000000 */
4596 +       .quad 0x00cff2000000ffff        /* 0x7b user 4GB data at 0x00000000 */
4597 +
4598 +       .quad 0x0000000000000000        /* 0x80 TSS descriptor */
4599 +       .quad 0x0000000000000000        /* 0x88 LDT descriptor */
4600 +
4601 +       /*
4602 +        * Segments used for calling PnP BIOS have byte granularity.
4603 +        * The code segments and data segments have fixed 64k limits,
4604 +        * the transfer segment sizes are set at run time.
4605 +        */
4606 +       .quad 0x0000000000000000        /* 0x90 32-bit code */
4607 +       .quad 0x0000000000000000        /* 0x98 16-bit code */
4608 +       .quad 0x0000000000000000        /* 0xa0 16-bit data */
4609 +       .quad 0x0000000000000000        /* 0xa8 16-bit data */
4610 +       .quad 0x0000000000000000        /* 0xb0 16-bit data */
4611 +
4612 +       /*
4613 +        * The APM segments have byte granularity and their bases
4614 +        * are set at run time.  All have 64k limits.
4615 +        */
4616 +       .quad 0x0000000000000000        /* 0xb8 APM CS    code */
4617 +       .quad 0x0000000000000000        /* 0xc0 APM CS 16 code (16 bit) */
4618 +       .quad 0x0000000000000000        /* 0xc8 APM DS    data */
4619 +
4620 +       .quad 0x0000000000000000        /* 0xd0 - ESPFIX 16-bit SS */
4621 +       .quad 0x0000000000000000        /* 0xd8 - unused */
4622 +       .quad 0x0000000000000000        /* 0xe0 - unused */
4623 +       .quad 0x0000000000000000        /* 0xe8 - unused */
4624 +       .quad 0x0000000000000000        /* 0xf0 - unused */
4625 +       .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
4626 +
4627 +#ifdef CONFIG_XEN_COMPAT_030002
4628 +/*
4629 + * __xen_guest information
4630 + */
4631 +.macro utoa value
4632 + .if (\value) < 0 || (\value) >= 0x10
4633 +       utoa (((\value)>>4)&0x0fffffff)
4634 + .endif
4635 + .if ((\value) & 0xf) < 10
4636 +  .byte '0' + ((\value) & 0xf)
4637 + .else
4638 +  .byte 'A' + ((\value) & 0xf) - 10
4639 + .endif
4640 +.endm
4641 +
4642 +.section __xen_guest
4643 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
4644 +       .ascii  ",XEN_VER=xen-3.0"
4645 +       .ascii  ",VIRT_BASE=0x"
4646 +               utoa __PAGE_OFFSET
4647 +       .ascii  ",ELF_PADDR_OFFSET=0x"
4648 +               utoa __PAGE_OFFSET
4649 +       .ascii  ",VIRT_ENTRY=0x"
4650 +               utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
4651 +       .ascii  ",HYPERCALL_PAGE=0x"
4652 +               utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
4653 +       .ascii  ",FEATURES=writable_page_tables"
4654 +       .ascii           "|writable_descriptor_tables"
4655 +       .ascii           "|auto_translated_physmap"
4656 +       .ascii           "|pae_pgdir_above_4gb"
4657 +       .ascii           "|supervisor_mode_kernel"
4658 +#ifdef CONFIG_X86_PAE
4659 +       .ascii  ",PAE=yes[extended-cr3]"
4660 +#else
4661 +       .ascii  ",PAE=no"
4662 +#endif
4663 +       .ascii  ",LOADER=generic"
4664 +       .byte   0
4665 +#endif /* CONFIG_XEN_COMPAT_030002 */
4666 +
4667 +
4668 +       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "linux")       
4669 +       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "2.6")
4670 +       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
4671 +       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long,  __PAGE_OFFSET)
4672 +#ifdef CONFIG_XEN_COMPAT_030002
4673 +       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .long,  __PAGE_OFFSET)
4674 +#else
4675 +       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .long,  0)
4676 +#endif /* !CONFIG_XEN_COMPAT_030002 */
4677 +       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long,  startup_32)
4678 +       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long,  hypercall_page)
4679 +       ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long,  HYPERVISOR_VIRT_START)
4680 +       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
4681 +#ifdef CONFIG_X86_PAE
4682 +       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "yes")
4683 +#else
4684 +       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "no")
4685 +#endif
4686 +       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz, "generic")
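
The __xen_guest section and the ELF notes above describe the guest to the Xen domain builder in two forms: a single NUL-terminated string of comma-separated KEY=VALUE pairs kept for 3.0.2-era compatibility (CONFIG_XEN_COMPAT_030002), and one ELF note per key for newer toolstacks. As a rough illustration of the string form only, the sketch below scans such a blob for a single key; find_guest_value() is a hypothetical userspace helper, not part of this patch or of the real domain builder.

#include <stdio.h>
#include <string.h>

/* Hypothetical helper: look up "key" in a comma-separated KEY=VALUE string
 * shaped like the __xen_guest blob above and copy its value into out[]. */
static int find_guest_value(const char *blob, const char *key,
                            char *out, size_t outlen)
{
        size_t klen = strlen(key);
        const char *p = blob;

        while (p && *p) {
                if (strncmp(p, key, klen) == 0 && p[klen] == '=') {
                        const char *val = p + klen + 1;
                        const char *end = strchr(val, ',');
                        size_t n = end ? (size_t)(end - val) : strlen(val);

                        if (n >= outlen)
                                n = outlen - 1;
                        memcpy(out, val, n);
                        out[n] = '\0';
                        return 0;
                }
                p = strchr(p, ',');     /* skip to the next pair, if any */
                if (p)
                        p++;
        }
        return -1;
}

int main(void)
{
        char buf[32];

        if (find_guest_value("GUEST_OS=linux,GUEST_VER=2.6,XEN_VER=xen-3.0",
                             "XEN_VER", buf, sizeof(buf)) == 0)
                printf("XEN_VER is %s\n", buf); /* prints "XEN_VER is xen-3.0" */
        return 0;
}
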
4687 diff -ruNp linux-2.6.19/arch/i386/kernel/init_task-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/init_task-xen.c
4688 --- linux-2.6.19/arch/i386/kernel/init_task-xen.c       1970-01-01 00:00:00.000000000 +0000
4689 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/init_task-xen.c     2007-02-02 19:10:21.000000000 +0000
4690 @@ -0,0 +1,51 @@
4691 +#include <linux/mm.h>
4692 +#include <linux/module.h>
4693 +#include <linux/sched.h>
4694 +#include <linux/init.h>
4695 +#include <linux/init_task.h>
4696 +#include <linux/fs.h>
4697 +#include <linux/mqueue.h>
4698 +
4699 +#include <asm/uaccess.h>
4700 +#include <asm/pgtable.h>
4701 +#include <asm/desc.h>
4702 +
4703 +static struct fs_struct init_fs = INIT_FS;
4704 +static struct files_struct init_files = INIT_FILES;
4705 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
4706 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
4707 +
4708 +#define swapper_pg_dir ((pgd_t *)NULL)
4709 +struct mm_struct init_mm = INIT_MM(init_mm);
4710 +#undef swapper_pg_dir
4711 +
4712 +EXPORT_SYMBOL(init_mm);
4713 +
4714 +/*
4715 + * Initial thread structure.
4716 + *
4717 + * We need to make sure that this is THREAD_SIZE aligned due to the
4718 + * way process stacks are handled. This is done by having a special
4719 + * "init_task" linker map entry..
4720 + */
4721 +union thread_union init_thread_union 
4722 +       __attribute__((__section__(".data.init_task"))) =
4723 +               { INIT_THREAD_INFO(init_task) };
4724 +
4725 +/*
4726 + * Initial task structure.
4727 + *
4728 + * All other task structs will be allocated on slabs in fork.c
4729 + */
4730 +struct task_struct init_task = INIT_TASK(init_task);
4731 +
4732 +EXPORT_SYMBOL(init_task);
4733 +
4734 +#ifndef CONFIG_X86_NO_TSS
4735 +/*
4736 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
4737 + * no more per-task TSS's.
4738 + */ 
4739 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
4740 +#endif
4741 +
4742 diff -ruNp linux-2.6.19/arch/i386/kernel/io_apic-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/io_apic-xen.c
4743 --- linux-2.6.19/arch/i386/kernel/io_apic-xen.c 1970-01-01 00:00:00.000000000 +0000
4744 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/io_apic-xen.c       2007-02-02 19:10:21.000000000 +0000
4745 @@ -0,0 +1,2965 @@
4746 +/*
4747 + *     Intel IO-APIC support for multi-Pentium hosts.
4748 + *
4749 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
4750 + *
4751 + *     Many thanks to Stig Venaas for trying out countless experimental
4752 + *     patches and reporting/debugging problems patiently!
4753 + *
4754 + *     (c) 1999, Multiple IO-APIC support, developed by
4755 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
4756 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
4757 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
4758 + *     and Ingo Molnar <mingo@redhat.com>
4759 + *
4760 + *     Fixes
4761 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
4762 + *                                     thanks to Eric Gilmore
4763 + *                                     and Rolf G. Tews
4764 + *                                     for testing these extensively
4765 + *     Paul Diefenbaugh        :       Added full ACPI support
4766 + */
4767 +
4768 +#include <linux/mm.h>
4769 +#include <linux/interrupt.h>
4770 +#include <linux/init.h>
4771 +#include <linux/delay.h>
4772 +#include <linux/sched.h>
4773 +#include <linux/smp_lock.h>
4774 +#include <linux/mc146818rtc.h>
4775 +#include <linux/compiler.h>
4776 +#include <linux/acpi.h>
4777 +#include <linux/module.h>
4778 +#include <linux/sysdev.h>
4779 +#include <linux/pci.h>
4780 +#include <linux/msi.h>
4781 +#include <linux/htirq.h>
4782 +
4783 +#include <asm/io.h>
4784 +#include <asm/smp.h>
4785 +#include <asm/desc.h>
4786 +#include <asm/timer.h>
4787 +#include <asm/i8259.h>
4788 +#include <asm/nmi.h>
4789 +#include <asm/msidef.h>
4790 +#include <asm/hypertransport.h>
4791 +
4792 +#include <mach_apic.h>
4793 +#include <mach_apicdef.h>
4794 +
4795 +#include "io_ports.h"
4796 +
4797 +#ifdef CONFIG_XEN
4798 +
4799 +#include <xen/interface/xen.h>
4800 +#include <xen/interface/physdev.h>
4801 +
4802 +/* Fake i8259 */
4803 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
4804 +#define disable_8259A_irq(_irq)  ((void)0)
4805 +#define i8259A_irq_pending(_irq) (0)
4806 +
4807 +unsigned long io_apic_irqs;
4808 +
4809 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
4810 +{
4811 +       struct physdev_apic apic_op;
4812 +       int ret;
4813 +
4814 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4815 +       apic_op.reg = reg;
4816 +       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
4817 +       if (ret)
4818 +               return ret;
4819 +       return apic_op.value;
4820 +}
4821 +
4822 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
4823 +{
4824 +       struct physdev_apic apic_op;
4825 +
4826 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4827 +       apic_op.reg = reg;
4828 +       apic_op.value = value;
4829 +       HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
4830 +}
4831 +
4832 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
4833 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
4834 +
4835 +#endif /* CONFIG_XEN */
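
Under CONFIG_XEN the two defines above route every io_apic_read()/io_apic_write() in this file through the PHYSDEVOP_apic_read/PHYSDEVOP_apic_write hypercalls instead of the memory-mapped window used by the native code further down. Below is a minimal sketch of a read-modify-write of one redirection-table entry's low word through these accessors; mask_rte_example() is a hypothetical helper and, unlike the real functions later in this file, omits the ioapic_lock locking.

static void mask_rte_example(unsigned int apic, unsigned int pin)
{
        /* the low dword of redirection entry "pin" sits at register 0x10 + 2*pin */
        unsigned int low = io_apic_read(apic, 0x10 + 2 * pin);

        low |= 0x00010000;              /* bit 16 is the per-entry interrupt mask */
        io_apic_write(apic, 0x10 + 2 * pin, low);
}
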
4836 +
4837 +int (*ioapic_renumber_irq)(int ioapic, int irq);
4838 +atomic_t irq_mis_count;
4839 +
4840 +/* Where, if anywhere, is the i8259 connected in external int mode */
4841 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
4842 +
4843 +static DEFINE_SPINLOCK(ioapic_lock);
4844 +static DEFINE_SPINLOCK(vector_lock);
4845 +
4846 +int timer_over_8254 __initdata = 1;
4847 +
4848 +/*
4849 + *     Is the SiS APIC rmw bug present ?
4850 + *     -1 = don't know, 0 = no, 1 = yes
4851 + */
4852 +int sis_apic_bug = -1;
4853 +
4854 +/*
4855 + * # of IRQ routing registers
4856 + */
4857 +int nr_ioapic_registers[MAX_IO_APICS];
4858 +
4859 +static int disable_timer_pin_1 __initdata;
4860 +
4861 +/*
4862 + * Rough estimation of how many shared IRQs there are, can
4863 + * be changed anytime.
4864 + */
4865 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
4866 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
4867 +
4868 +/*
4869 + * This is performance-critical; we want to do it in O(1).
4870 + *
4871 + * The indexing order of this array favors 1:1 mappings
4872 + * between pins and IRQs.
4873 + */
4874 +
4875 +static struct irq_pin_list {
4876 +       int apic, pin, next;
4877 +} irq_2_pin[PIN_MAP_SIZE];
4878 +
4879 +#ifndef CONFIG_XEN
4880 +struct io_apic {
4881 +       unsigned int index;
4882 +       unsigned int unused[3];
4883 +       unsigned int data;
4884 +};
4885 +
4886 +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
4887 +{
4888 +       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
4889 +               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
4890 +}
4891 +
4892 +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
4893 +{
4894 +       struct io_apic __iomem *io_apic = io_apic_base(apic);
4895 +       writel(reg, &io_apic->index);
4896 +       return readl(&io_apic->data);
4897 +}
4898 +
4899 +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
4900 +{
4901 +       struct io_apic __iomem *io_apic = io_apic_base(apic);
4902 +       writel(reg, &io_apic->index);
4903 +       writel(value, &io_apic->data);
4904 +}
4905 +
4906 +/*
4907 + * Re-write a value: to be used for read-modify-write
4908 + * cycles where the read already set up the index register.
4909 + *
4910 + * Older SiS APIC requires we rewrite the index register
4911 + */
4912 +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
4913 +{
4914 +       volatile struct io_apic *io_apic = io_apic_base(apic);
4915 +       if (sis_apic_bug)
4916 +               writel(reg, &io_apic->index);
4917 +       writel(value, &io_apic->data);
4918 +}
4919 +#endif /* !CONFIG_XEN */
4920 +
4921 +union entry_union {
4922 +       struct { u32 w1, w2; };
4923 +       struct IO_APIC_route_entry entry;
4924 +};
4925 +
4926 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
4927 +{
4928 +       union entry_union eu;
4929 +       unsigned long flags;
4930 +       spin_lock_irqsave(&ioapic_lock, flags);
4931 +       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
4932 +       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
4933 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4934 +       return eu.entry;
4935 +}
4936 +
4937 +/*
4938 + * When we write a new IO APIC routing entry, we need to write the high
4939 + * word first! If the mask bit in the low word is clear, we will enable
4940 + * the interrupt, and we need to make sure the entry is fully populated
4941 + * before that happens.
4942 + */
4943 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
4944 +{
4945 +       unsigned long flags;
4946 +       union entry_union eu;
4947 +       eu.entry = e;
4948 +       spin_lock_irqsave(&ioapic_lock, flags);
4949 +       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
4950 +       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
4951 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4952 +}
4953 +
4954 +/*
4955 + * When we mask an IO APIC routing entry, we need to write the low
4956 + * word first, in order to set the mask bit before we change the
4957 + * high bits!
4958 + */
4959 +static void ioapic_mask_entry(int apic, int pin)
4960 +{
4961 +       unsigned long flags;
4962 +       union entry_union eu = { .entry.mask = 1 };
4963 +
4964 +       spin_lock_irqsave(&ioapic_lock, flags);
4965 +       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
4966 +       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
4967 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4968 +}
4969 +
4970 +/*
4971 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
4972 + * shared ISA-space IRQs, so we have to support them. We are super
4973 + * fast in the common case, and fast for shared ISA-space IRQs.
4974 + */
4975 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
4976 +{
4977 +       static int first_free_entry = NR_IRQS;
4978 +       struct irq_pin_list *entry = irq_2_pin + irq;
4979 +
4980 +       while (entry->next)
4981 +               entry = irq_2_pin + entry->next;
4982 +
4983 +       if (entry->pin != -1) {
4984 +               entry->next = first_free_entry;
4985 +               entry = irq_2_pin + entry->next;
4986 +               if (++first_free_entry >= PIN_MAP_SIZE)
4987 +                       panic("io_apic.c: whoops");
4988 +       }
4989 +       entry->apic = apic;
4990 +       entry->pin = pin;
4991 +}
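
irq_2_pin[] above is a statically sized chained array: slot irq heads a list of (apic, pin) pairs linked through the next index, with pin == -1 marking a slot that has no pin assigned yet. The sketch below walks one chain in the same style as __modify_IO_APIC_irq() further down, assuming only the definitions in this hunk; count_pins_for_irq() is a hypothetical helper, not part of the patch.

static int count_pins_for_irq(unsigned int irq)
{
        struct irq_pin_list *entry = irq_2_pin + irq;
        int count = 0;

        for (;;) {
                if (entry->pin != -1)
                        count++;                /* slot holds a real (apic, pin) pair */
                if (!entry->next)               /* a next index of 0 ends the chain */
                        break;
                entry = irq_2_pin + entry->next;
        }
        return count;
}
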
4992 +
4993 +#ifdef CONFIG_XEN
4994 +#define clear_IO_APIC() ((void)0)
4995 +#else
4996 +/*
4997 + * Reroute an IRQ to a different pin.
4998 + */
4999 +static void __init replace_pin_at_irq(unsigned int irq,
5000 +                                     int oldapic, int oldpin,
5001 +                                     int newapic, int newpin)
5002 +{
5003 +       struct irq_pin_list *entry = irq_2_pin + irq;
5004 +
5005 +       while (1) {
5006 +               if (entry->apic == oldapic && entry->pin == oldpin) {
5007 +                       entry->apic = newapic;
5008 +                       entry->pin = newpin;
5009 +               }
5010 +               if (!entry->next)
5011 +                       break;
5012 +               entry = irq_2_pin + entry->next;
5013 +       }
5014 +}
5015 +
5016 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
5017 +{
5018 +       struct irq_pin_list *entry = irq_2_pin + irq;
5019 +       unsigned int pin, reg;
5020 +
5021 +       for (;;) {
5022 +               pin = entry->pin;
5023 +               if (pin == -1)
5024 +                       break;
5025 +               reg = io_apic_read(entry->apic, 0x10 + pin*2);
5026 +               reg &= ~disable;
5027 +               reg |= enable;
5028 +               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
5029 +               if (!entry->next)
5030 +                       break;
5031 +               entry = irq_2_pin + entry->next;
5032 +       }
5033 +}
5034 +
5035 +/* mask = 1 */
5036 +static void __mask_IO_APIC_irq (unsigned int irq)
5037 +{
5038 +       __modify_IO_APIC_irq(irq, 0x00010000, 0);
5039 +}
5040 +
5041 +/* mask = 0 */
5042 +static void __unmask_IO_APIC_irq (unsigned int irq)
5043 +{
5044 +       __modify_IO_APIC_irq(irq, 0, 0x00010000);
5045 +}
5046 +
5047 +/* mask = 1, trigger = 0 */
5048 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
5049 +{
5050 +       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
5051 +}
5052 +
5053 +/* mask = 0, trigger = 1 */
5054 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
5055 +{
5056 +       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
5057 +}
5058 +
5059 +static void mask_IO_APIC_irq (unsigned int irq)
5060 +{
5061 +       unsigned long flags;
5062 +
5063 +       spin_lock_irqsave(&ioapic_lock, flags);
5064 +       __mask_IO_APIC_irq(irq);
5065 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5066 +}
5067 +
5068 +static void unmask_IO_APIC_irq (unsigned int irq)
5069 +{
5070 +       unsigned long flags;
5071 +
5072 +       spin_lock_irqsave(&ioapic_lock, flags);
5073 +       __unmask_IO_APIC_irq(irq);
5074 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5075 +}
5076 +
5077 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
5078 +{
5079 +       struct IO_APIC_route_entry entry;
5080 +       
5081 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
5082 +       entry = ioapic_read_entry(apic, pin);
5083 +       if (entry.delivery_mode == dest_SMI)
5084 +               return;
5085 +
5086 +       /*
5087 +        * Disable it in the IO-APIC irq-routing table:
5088 +        */
5089 +       ioapic_mask_entry(apic, pin);
5090 +}
5091 +
5092 +static void clear_IO_APIC (void)
5093 +{
5094 +       int apic, pin;
5095 +
5096 +       for (apic = 0; apic < nr_ioapics; apic++)
5097 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
5098 +                       clear_IO_APIC_pin(apic, pin);
5099 +}
5100 +
5101 +#ifdef CONFIG_SMP
5102 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
5103 +{
5104 +       unsigned long flags;
5105 +       int pin;
5106 +       struct irq_pin_list *entry = irq_2_pin + irq;
5107 +       unsigned int apicid_value;
5108 +       cpumask_t tmp;
5109 +       
5110 +       cpus_and(tmp, cpumask, cpu_online_map);
5111 +       if (cpus_empty(tmp))
5112 +               tmp = TARGET_CPUS;
5113 +
5114 +       cpus_and(cpumask, tmp, CPU_MASK_ALL);
5115 +
5116 +       apicid_value = cpu_mask_to_apicid(cpumask);
5117 +       /* Prepare to do the io_apic_write */
5118 +       apicid_value = apicid_value << 24;
5119 +       spin_lock_irqsave(&ioapic_lock, flags);
5120 +       for (;;) {
5121 +               pin = entry->pin;
5122 +               if (pin == -1)
5123 +                       break;
5124 +               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
5125 +               if (!entry->next)
5126 +                       break;
5127 +               entry = irq_2_pin + entry->next;
5128 +       }
5129 +       set_native_irq_info(irq, cpumask);
5130 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5131 +}
5132 +
5133 +#if defined(CONFIG_IRQBALANCE)
5134 +# include <asm/processor.h>    /* kernel_thread() */
5135 +# include <linux/kernel_stat.h>        /* kstat */
5136 +# include <linux/slab.h>               /* kmalloc() */
5137 +# include <linux/timer.h>      /* time_after() */
5138 +
5139 +#ifdef CONFIG_BALANCED_IRQ_DEBUG
5140 +#  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
5141 +#  define Dprintk(x...) do { TDprintk(x); } while (0)
5142 +# else
5143 +#  define TDprintk(x...) 
5144 +#  define Dprintk(x...) 
5145 +# endif
5146 +
5147 +#define IRQBALANCE_CHECK_ARCH -999
5148 +#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
5149 +#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
5150 +#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
5151 +#define BALANCED_IRQ_LESS_DELTA                (HZ)
5152 +
5153 +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
5154 +static int physical_balance __read_mostly;
5155 +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
5156 +
5157 +static struct irq_cpu_info {
5158 +       unsigned long * last_irq;
5159 +       unsigned long * irq_delta;
5160 +       unsigned long irq;
5161 +} irq_cpu_data[NR_CPUS];
5162 +
5163 +#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
5164 +#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
5165 +#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
5166 +
5167 +#define IDLE_ENOUGH(cpu,now) \
5168 +       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
5169 +
5170 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
5171 +
5172 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
5173 +
5174 +static cpumask_t balance_irq_affinity[NR_IRQS] = {
5175 +       [0 ... NR_IRQS-1] = CPU_MASK_ALL
5176 +};
5177 +
5178 +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
5179 +{
5180 +       balance_irq_affinity[irq] = mask;
5181 +}
5182 +
5183 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
5184 +                       unsigned long now, int direction)
5185 +{
5186 +       int search_idle = 1;
5187 +       int cpu = curr_cpu;
5188 +
5189 +       goto inside;
5190 +
5191 +       do {
5192 +               if (unlikely(cpu == curr_cpu))
5193 +                       search_idle = 0;
5194 +inside:
5195 +               if (direction == 1) {
5196 +                       cpu++;
5197 +                       if (cpu >= NR_CPUS)
5198 +                               cpu = 0;
5199 +               } else {
5200 +                       cpu--;
5201 +                       if (cpu == -1)
5202 +                               cpu = NR_CPUS-1;
5203 +               }
5204 +       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
5205 +                       (search_idle && !IDLE_ENOUGH(cpu,now)));
5206 +
5207 +       return cpu;
5208 +}
5209 +
5210 +static inline void balance_irq(int cpu, int irq)
5211 +{
5212 +       unsigned long now = jiffies;
5213 +       cpumask_t allowed_mask;
5214 +       unsigned int new_cpu;
5215 +               
5216 +       if (irqbalance_disabled)
5217 +               return; 
5218 +
5219 +       cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
5220 +       new_cpu = move(cpu, allowed_mask, now, 1);
5221 +       if (cpu != new_cpu) {
5222 +               set_pending_irq(irq, cpumask_of_cpu(new_cpu));
5223 +       }
5224 +}
5225 +
5226 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
5227 +{
5228 +       int i, j;
5229 +       Dprintk("Rotating IRQs among CPUs.\n");
5230 +       for_each_online_cpu(i) {
5231 +               for (j = 0; j < NR_IRQS; j++) {
5232 +                       if (!irq_desc[j].action)
5233 +                               continue;
5234 +                       /* Is it a significant load ?  */
5235 +                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
5236 +                                               useful_load_threshold)
5237 +                               continue;
5238 +                       balance_irq(i, j);
5239 +               }
5240 +       }
5241 +       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
5242 +               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
5243 +       return;
5244 +}
5245 +
5246 +static void do_irq_balance(void)
5247 +{
5248 +       int i, j;
5249 +       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
5250 +       unsigned long move_this_load = 0;
5251 +       int max_loaded = 0, min_loaded = 0;
5252 +       int load;
5253 +       unsigned long useful_load_threshold = balanced_irq_interval + 10;
5254 +       int selected_irq;
5255 +       int tmp_loaded, first_attempt = 1;
5256 +       unsigned long tmp_cpu_irq;
5257 +       unsigned long imbalance = 0;
5258 +       cpumask_t allowed_mask, target_cpu_mask, tmp;
5259 +
5260 +       for_each_possible_cpu(i) {
5261 +               int package_index;
5262 +               CPU_IRQ(i) = 0;
5263 +               if (!cpu_online(i))
5264 +                       continue;
5265 +               package_index = CPU_TO_PACKAGEINDEX(i);
5266 +               for (j = 0; j < NR_IRQS; j++) {
5267 +                       unsigned long value_now, delta;
5268 +                       /* Is this an active IRQ? */
5269 +                       if (!irq_desc[j].action)
5270 +                               continue;
5271 +                       if ( package_index == i )
5272 +                               IRQ_DELTA(package_index,j) = 0;
5273 +                       /* Determine the total count per processor per IRQ */
5274 +                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
5275 +
5276 +                       /* Determine the activity per processor per IRQ */
5277 +                       delta = value_now - LAST_CPU_IRQ(i,j);
5278 +
5279 +                       /* Update last_cpu_irq[][] for the next time */
5280 +                       LAST_CPU_IRQ(i,j) = value_now;
5281 +
5282 +                       /* Ignore IRQs whose rate is less than the clock */
5283 +                       if (delta < useful_load_threshold)
5284 +                               continue;
5285 +                       /* update the load for the processor or package total */
5286 +                       IRQ_DELTA(package_index,j) += delta;
5287 +
5288 +                       /* Keep track of the higher numbered sibling as well */
5289 +                       if (i != package_index)
5290 +                               CPU_IRQ(i) += delta;
5291 +                       /*
5292 +                        * We have sibling A and sibling B in the package
5293 +                        *
5294 +                        * cpu_irq[A] = load for cpu A + load for cpu B
5295 +                        * cpu_irq[B] = load for cpu B
5296 +                        */
5297 +                       CPU_IRQ(package_index) += delta;
5298 +               }
5299 +       }
5300 +       /* Find the least loaded processor package */
5301 +       for_each_online_cpu(i) {
5302 +               if (i != CPU_TO_PACKAGEINDEX(i))
5303 +                       continue;
5304 +               if (min_cpu_irq > CPU_IRQ(i)) {
5305 +                       min_cpu_irq = CPU_IRQ(i);
5306 +                       min_loaded = i;
5307 +               }
5308 +       }
5309 +       max_cpu_irq = ULONG_MAX;
5310 +
5311 +tryanothercpu:
5312 +       /* Look for heaviest loaded processor.
5313 +        * We may come back to get the next heaviest loaded processor.
5314 +        * Skip processors with trivial loads.
5315 +        */
5316 +       tmp_cpu_irq = 0;
5317 +       tmp_loaded = -1;
5318 +       for_each_online_cpu(i) {
5319 +               if (i != CPU_TO_PACKAGEINDEX(i))
5320 +                       continue;
5321 +               if (max_cpu_irq <= CPU_IRQ(i)) 
5322 +                       continue;
5323 +               if (tmp_cpu_irq < CPU_IRQ(i)) {
5324 +                       tmp_cpu_irq = CPU_IRQ(i);
5325 +                       tmp_loaded = i;
5326 +               }
5327 +       }
5328 +
5329 +       if (tmp_loaded == -1) {
5330 +        /* With a small number of heavy interrupt sources, some CPUs end
5331 +         * up loaded too much; fall back to Ingo's original approach of
5332 +         * rotating the IRQs around.
5333 +         */
5334 +               if (!first_attempt && imbalance >= useful_load_threshold) {
5335 +                       rotate_irqs_among_cpus(useful_load_threshold);
5336 +                       return;
5337 +               }
5338 +               goto not_worth_the_effort;
5339 +       }
5340 +       
5341 +       first_attempt = 0;              /* heaviest search */
5342 +       max_cpu_irq = tmp_cpu_irq;      /* load */
5343 +       max_loaded = tmp_loaded;        /* processor */
5344 +       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
5345 +       
5346 +       Dprintk("max_loaded cpu = %d\n", max_loaded);
5347 +       Dprintk("min_loaded cpu = %d\n", min_loaded);
5348 +       Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
5349 +       Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
5350 +       Dprintk("load imbalance = %lu\n", imbalance);
5351 +
5352 +       /* if the imbalance is less than roughly 1/8 of the max load, we are
5353 +        * into diminishing returns - quit
5354 +        */
5355 +       if (imbalance < (max_cpu_irq >> 3)) {
5356 +               Dprintk("Imbalance too trivial\n");
5357 +               goto not_worth_the_effort;
5358 +       }
5359 +
5360 +tryanotherirq:
5361 +       /* if we select an IRQ to move that can't go where we want, then
5362 +        * see if there is another one to try.
5363 +        */
5364 +       move_this_load = 0;
5365 +       selected_irq = -1;
5366 +       for (j = 0; j < NR_IRQS; j++) {
5367 +               /* Is this an active IRQ? */
5368 +               if (!irq_desc[j].action)
5369 +                       continue;
5370 +               if (imbalance <= IRQ_DELTA(max_loaded,j))
5371 +                       continue;
5372 +               /* Try to find the IRQ that is closest to the imbalance
5373 +                * without going over.
5374 +                */
5375 +               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
5376 +                       move_this_load = IRQ_DELTA(max_loaded,j);
5377 +                       selected_irq = j;
5378 +               }
5379 +       }
5380 +       if (selected_irq == -1) {
5381 +               goto tryanothercpu;
5382 +       }
5383 +
5384 +       imbalance = move_this_load;
5385 +       
5386 +       /* For the physical_balance case, we accumulated both load
5387 +        * values in one of the siblings' cpu_irq[] slots, so that the
5388 +        * same code can be used for physical and logical processors
5389 +        * as much as possible.
5390 +        *
5391 +        * NOTE: the cpu_irq[] array holds the sum of the load for
5392 +        * sibling A and sibling B in the slot for the lowest numbered
5393 +        * sibling (A), _AND_ the load for sibling B in the slot for
5394 +        * the higher numbered sibling.
5395 +        *
5396 +        * We seek the least loaded sibling by making the comparison
5397 +        * (A+B)/2 vs B
5398 +        */
5399 +       load = CPU_IRQ(min_loaded) >> 1;
5400 +       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
5401 +               if (load > CPU_IRQ(j)) {
5402 +                       /* This won't change cpu_sibling_map[min_loaded] */
5403 +                       load = CPU_IRQ(j);
5404 +                       min_loaded = j;
5405 +               }
5406 +       }
5407 +
5408 +       cpus_and(allowed_mask,
5409 +               cpu_online_map,
5410 +               balance_irq_affinity[selected_irq]);
5411 +       target_cpu_mask = cpumask_of_cpu(min_loaded);
5412 +       cpus_and(tmp, target_cpu_mask, allowed_mask);
5413 +
5414 +       if (!cpus_empty(tmp)) {
5415 +
5416 +               Dprintk("irq = %d moved to cpu = %d\n",
5417 +                               selected_irq, min_loaded);
5418 +               /* mark for change destination */
5419 +               set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
5420 +
5421 +               /* Since we made a change, come back sooner to 
5422 +                * check for more variation.
5423 +                */
5424 +               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
5425 +                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
5426 +               return;
5427 +       }
5428 +       goto tryanotherirq;
5429 +
5430 +not_worth_the_effort:
5431 +       /*
5432 +        * if we did not find an IRQ to move, then adjust the time interval
5433 +        * upward
5434 +        */
5435 +       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
5436 +               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
5437 +       Dprintk("IRQ worth rotating not found\n");
5438 +       return;
5439 +}
5440 +
5441 +static int balanced_irq(void *unused)
5442 +{
5443 +       int i;
5444 +       unsigned long prev_balance_time = jiffies;
5445 +       long time_remaining = balanced_irq_interval;
5446 +
5447 +       daemonize("kirqd");
5448 +       
5449 +       /* push everything to CPU 0 to give us a starting point.  */
5450 +       for (i = 0 ; i < NR_IRQS ; i++) {
5451 +               irq_desc[i].pending_mask = cpumask_of_cpu(0);
5452 +               set_pending_irq(i, cpumask_of_cpu(0));
5453 +       }
5454 +
5455 +       for ( ; ; ) {
5456 +               time_remaining = schedule_timeout_interruptible(time_remaining);
5457 +               try_to_freeze();
5458 +               if (time_after(jiffies,
5459 +                               prev_balance_time+balanced_irq_interval)) {
5460 +                       preempt_disable();
5461 +                       do_irq_balance();
5462 +                       prev_balance_time = jiffies;
5463 +                       time_remaining = balanced_irq_interval;
5464 +                       preempt_enable();
5465 +               }
5466 +       }
5467 +       return 0;
5468 +}
5469 +
5470 +static int __init balanced_irq_init(void)
5471 +{
5472 +       int i;
5473 +       struct cpuinfo_x86 *c;
5474 +       cpumask_t tmp;
5475 +
5476 +       cpus_shift_right(tmp, cpu_online_map, 2);
5477 +        c = &boot_cpu_data;
5478 +       /* When not overridden on the command line, ask the subarchitecture. */
5479 +       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
5480 +               irqbalance_disabled = NO_BALANCE_IRQ;
5481 +       if (irqbalance_disabled)
5482 +               return 0;
5483 +       
5484 +        /* disable irqbalance completely if there is only one processor online */
5485 +       if (num_online_cpus() < 2) {
5486 +               irqbalance_disabled = 1;
5487 +               return 0;
5488 +       }
5489 +       /*
5490 +        * Enable physical balance only if more than 1 physical processor
5491 +        * is present
5492 +        */
5493 +       if (smp_num_siblings > 1 && !cpus_empty(tmp))
5494 +               physical_balance = 1;
5495 +
5496 +       for_each_online_cpu(i) {
5497 +               irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
5498 +               irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
5499 +               if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
5500 +                       printk(KERN_ERR "balanced_irq_init: out of memory");
5501 +                       goto failed;
5502 +               }
5503 +               memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
5504 +               memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
5505 +       }
5506 +       
5507 +       printk(KERN_INFO "Starting balanced_irq\n");
5508 +       if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 
5509 +               return 0;
5510 +       else 
5511 +               printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
5512 +failed:
5513 +       for_each_possible_cpu(i) {
5514 +               kfree(irq_cpu_data[i].irq_delta);
5515 +               irq_cpu_data[i].irq_delta = NULL;
5516 +               kfree(irq_cpu_data[i].last_irq);
5517 +               irq_cpu_data[i].last_irq = NULL;
5518 +       }
5519 +       return 0;
5520 +}
5521 +
5522 +int __init irqbalance_disable(char *str)
5523 +{
5524 +       irqbalance_disabled = 1;
5525 +       return 1;
5526 +}
5527 +
5528 +__setup("noirqbalance", irqbalance_disable);
5529 +
5530 +late_initcall(balanced_irq_init);
5531 +#endif /* CONFIG_IRQBALANCE */
5532 +#endif /* CONFIG_SMP */
5533 +#endif /* !CONFIG_XEN */
5534 +
5535 +#ifndef CONFIG_SMP
5536 +void fastcall send_IPI_self(int vector)
5537 +{
5538 +#ifndef CONFIG_XEN
5539 +       unsigned int cfg;
5540 +
5541 +       /*
5542 +        * Wait for idle.
5543 +        */
5544 +       apic_wait_icr_idle();
5545 +       cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
5546 +       /*
5547 +        * Send the IPI. The write to APIC_ICR fires this off.
5548 +        */
5549 +       apic_write_around(APIC_ICR, cfg);
5550 +#endif
5551 +}
5552 +#endif /* !CONFIG_SMP */
5553 +
5554 +
5555 +/*
5556 + * support for broken MP BIOSes; enables hand-redirection of PIRQ0-7 to
5557 + * specific CPU-side IRQs.
5558 + */
5559 +
5560 +#define MAX_PIRQS 8
5561 +static int pirq_entries [MAX_PIRQS];
5562 +static int pirqs_enabled;
5563 +int skip_ioapic_setup;
5564 +
5565 +static int __init ioapic_setup(char *str)
5566 +{
5567 +       skip_ioapic_setup = 1;
5568 +       return 1;
5569 +}
5570 +
5571 +__setup("noapic", ioapic_setup);
5572 +
5573 +static int __init ioapic_pirq_setup(char *str)
5574 +{
5575 +       int i, max;
5576 +       int ints[MAX_PIRQS+1];
5577 +
5578 +       get_options(str, ARRAY_SIZE(ints), ints);
5579 +
5580 +       for (i = 0; i < MAX_PIRQS; i++)
5581 +               pirq_entries[i] = -1;
5582 +
5583 +       pirqs_enabled = 1;
5584 +       apic_printk(APIC_VERBOSE, KERN_INFO
5585 +                       "PIRQ redirection, working around broken MP-BIOS.\n");
5586 +       max = MAX_PIRQS;
5587 +       if (ints[0] < MAX_PIRQS)
5588 +               max = ints[0];
5589 +
5590 +       for (i = 0; i < max; i++) {
5591 +               apic_printk(APIC_VERBOSE, KERN_DEBUG
5592 +                               "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
5593 +               /*
5594 +                * PIRQs are mapped upside down, usually.
5595 +                */
5596 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
5597 +       }
5598 +       return 1;
5599 +}
5600 +
5601 +__setup("pirq=", ioapic_pirq_setup);
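
A usage sketch for the two boot parameters registered above (the IRQ numbers are examples only): "noapic" sets skip_ioapic_setup and bypasses the IO-APIC code, while "pirq=" assigns the listed IRQs to PIRQ0, PIRQ1, ... in order, a value of 0 disabling that PIRQ; internally the values are stored reversed in pirq_entries[] because the PIRQ pins are mapped upside down (see pin_2_irq() below).

/* Illustrative command lines, not taken from any real configuration:
 *
 *   noapic          - skip IO-APIC setup entirely
 *   pirq=15,11,10   - PIRQ0 -> IRQ 15, PIRQ1 -> IRQ 11, PIRQ2 -> IRQ 10
 *   pirq=0,9        - PIRQ0 disabled, PIRQ1 -> IRQ 9
 */
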
5602 +
5603 +/*
5604 + * Find the IRQ entry number of a certain pin.
5605 + */
5606 +static int find_irq_entry(int apic, int pin, int type)
5607 +{
5608 +       int i;
5609 +
5610 +       for (i = 0; i < mp_irq_entries; i++)
5611 +               if (mp_irqs[i].mpc_irqtype == type &&
5612 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
5613 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
5614 +                   mp_irqs[i].mpc_dstirq == pin)
5615 +                       return i;
5616 +
5617 +       return -1;
5618 +}
5619 +
5620 +/*
5621 + * Find the pin to which IRQ[irq] (ISA) is connected
5622 + */
5623 +static int __init find_isa_irq_pin(int irq, int type)
5624 +{
5625 +       int i;
5626 +
5627 +       for (i = 0; i < mp_irq_entries; i++) {
5628 +               int lbus = mp_irqs[i].mpc_srcbus;
5629 +
5630 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
5631 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
5632 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
5633 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
5634 +                   ) &&
5635 +                   (mp_irqs[i].mpc_irqtype == type) &&
5636 +                   (mp_irqs[i].mpc_srcbusirq == irq))
5637 +
5638 +                       return mp_irqs[i].mpc_dstirq;
5639 +       }
5640 +       return -1;
5641 +}
5642 +
5643 +static int __init find_isa_irq_apic(int irq, int type)
5644 +{
5645 +       int i;
5646 +
5647 +       for (i = 0; i < mp_irq_entries; i++) {
5648 +               int lbus = mp_irqs[i].mpc_srcbus;
5649 +
5650 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
5651 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
5652 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
5653 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
5654 +                   ) &&
5655 +                   (mp_irqs[i].mpc_irqtype == type) &&
5656 +                   (mp_irqs[i].mpc_srcbusirq == irq))
5657 +                       break;
5658 +       }
5659 +       if (i < mp_irq_entries) {
5660 +               int apic;
5661 +               for(apic = 0; apic < nr_ioapics; apic++) {
5662 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
5663 +                               return apic;
5664 +               }
5665 +       }
5666 +
5667 +       return -1;
5668 +}
5669 +
5670 +/*
5671 + * Find a specific PCI IRQ entry.
5672 + * Not an __init, possibly needed by modules
5673 + */
5674 +static int pin_2_irq(int idx, int apic, int pin);
5675 +
5676 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
5677 +{
5678 +       int apic, i, best_guess = -1;
5679 +
5680 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
5681 +               "slot:%d, pin:%d.\n", bus, slot, pin);
5682 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
5683 +               printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
5684 +               return -1;
5685 +       }
5686 +       for (i = 0; i < mp_irq_entries; i++) {
5687 +               int lbus = mp_irqs[i].mpc_srcbus;
5688 +
5689 +               for (apic = 0; apic < nr_ioapics; apic++)
5690 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
5691 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
5692 +                               break;
5693 +
5694 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
5695 +                   !mp_irqs[i].mpc_irqtype &&
5696 +                   (bus == lbus) &&
5697 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
5698 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
5699 +
5700 +                       if (!(apic || IO_APIC_IRQ(irq)))
5701 +                               continue;
5702 +
5703 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
5704 +                               return irq;
5705 +                       /*
5706 +                        * Use the first all-but-pin matching entry as a
5707 +                        * best-guess fuzzy result for broken mptables.
5708 +                        */
5709 +                       if (best_guess < 0)
5710 +                               best_guess = irq;
5711 +               }
5712 +       }
5713 +       return best_guess;
5714 +}
5715 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
5716 +
5717 +/*
5718 + * This function is currently only a helper for the i386 SMP boot process, where
5719 + * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
5720 + * so the mask in all cases should simply be TARGET_CPUS.
5721 + */
5722 +#ifdef CONFIG_SMP
5723 +#ifndef CONFIG_XEN
5724 +void __init setup_ioapic_dest(void)
5725 +{
5726 +       int pin, ioapic, irq, irq_entry;
5727 +
5728 +       if (skip_ioapic_setup == 1)
5729 +               return;
5730 +
5731 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
5732 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
5733 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
5734 +                       if (irq_entry == -1)
5735 +                               continue;
5736 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
5737 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
5738 +               }
5739 +
5740 +       }
5741 +}
5742 +#endif /* !CONFIG_XEN */
5743 +#endif
5744 +
5745 +/*
5746 + * EISA Edge/Level control register, ELCR
5747 + */
5748 +static int EISA_ELCR(unsigned int irq)
5749 +{
5750 +       if (irq < 16) {
5751 +               unsigned int port = 0x4d0 + (irq >> 3);
5752 +               return (inb(port) >> (irq & 7)) & 1;
5753 +       }
5754 +       apic_printk(APIC_VERBOSE, KERN_INFO
5755 +                       "Broken MPtable reports ISA irq %d\n", irq);
5756 +       return 0;
5757 +}
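
There are two ELCR ports, 0x4d0 covering IRQs 0-7 and 0x4d1 covering IRQs 8-15, one bit per IRQ, which is what the shift arithmetic above encodes. A worked example, purely for illustration:

/* EISA_ELCR(9) reads port 0x4d0 + (9 >> 3) = 0x4d1 and returns
 * (value >> (9 & 7)) & 1, i.e. bit 1 of that port; a set bit means the
 * line is level-triggered, a clear bit means edge-triggered. */
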
5758 +
5759 +/* EISA interrupts are always polarity zero and can be edge or level
5760 + * trigger depending on the ELCR value.  If an interrupt is listed as
5761 + * EISA conforming in the MP table, that means its trigger type must
5762 + * be read in from the ELCR */
5763 +
5764 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
5765 +#define default_EISA_polarity(idx)     (0)
5766 +
5767 +/* ISA interrupts are always polarity zero edge triggered,
5768 + * when listed as conforming in the MP table. */
5769 +
5770 +#define default_ISA_trigger(idx)       (0)
5771 +#define default_ISA_polarity(idx)      (0)
5772 +
5773 +/* PCI interrupts are always polarity one level triggered,
5774 + * when listed as conforming in the MP table. */
5775 +
5776 +#define default_PCI_trigger(idx)       (1)
5777 +#define default_PCI_polarity(idx)      (1)
5778 +
5779 +/* MCA interrupts are always polarity zero level triggered,
5780 + * when listed as conforming in the MP table. */
5781 +
5782 +#define default_MCA_trigger(idx)       (1)
5783 +#define default_MCA_polarity(idx)      (0)
5784 +
5785 +/* NEC98 interrupts are always polarity zero edge triggered,
5786 + * when listed as conforming in the MP table. */
5787 +
5788 +#define default_NEC98_trigger(idx)     (0)
5789 +#define default_NEC98_polarity(idx)    (0)
5790 +
5791 +static int __init MPBIOS_polarity(int idx)
5792 +{
5793 +       int bus = mp_irqs[idx].mpc_srcbus;
5794 +       int polarity;
5795 +
5796 +       /*
5797 +        * Determine IRQ line polarity (high active or low active):
5798 +        */
5799 +       switch (mp_irqs[idx].mpc_irqflag & 3)
5800 +       {
5801 +               case 0: /* conforms, ie. bus-type dependent polarity */
5802 +               {
5803 +                       switch (mp_bus_id_to_type[bus])
5804 +                       {
5805 +                               case MP_BUS_ISA: /* ISA pin */
5806 +                               {
5807 +                                       polarity = default_ISA_polarity(idx);
5808 +                                       break;
5809 +                               }
5810 +                               case MP_BUS_EISA: /* EISA pin */
5811 +                               {
5812 +                                       polarity = default_EISA_polarity(idx);
5813 +                                       break;
5814 +                               }
5815 +                               case MP_BUS_PCI: /* PCI pin */
5816 +                               {
5817 +                                       polarity = default_PCI_polarity(idx);
5818 +                                       break;
5819 +                               }
5820 +                               case MP_BUS_MCA: /* MCA pin */
5821 +                               {
5822 +                                       polarity = default_MCA_polarity(idx);
5823 +                                       break;
5824 +                               }
5825 +                               case MP_BUS_NEC98: /* NEC 98 pin */
5826 +                               {
5827 +                                       polarity = default_NEC98_polarity(idx);
5828 +                                       break;
5829 +                               }
5830 +                               default:
5831 +                               {
5832 +                                       printk(KERN_WARNING "broken BIOS!!\n");
5833 +                                       polarity = 1;
5834 +                                       break;
5835 +                               }
5836 +                       }
5837 +                       break;
5838 +               }
5839 +               case 1: /* high active */
5840 +               {
5841 +                       polarity = 0;
5842 +                       break;
5843 +               }
5844 +               case 2: /* reserved */
5845 +               {
5846 +                       printk(KERN_WARNING "broken BIOS!!\n");
5847 +                       polarity = 1;
5848 +                       break;
5849 +               }
5850 +               case 3: /* low active */
5851 +               {
5852 +                       polarity = 1;
5853 +                       break;
5854 +               }
5855 +               default: /* invalid */
5856 +               {
5857 +                       printk(KERN_WARNING "broken BIOS!!\n");
5858 +                       polarity = 1;
5859 +                       break;
5860 +               }
5861 +       }
5862 +       return polarity;
5863 +}
5864 +
5865 +static int MPBIOS_trigger(int idx)
5866 +{
5867 +       int bus = mp_irqs[idx].mpc_srcbus;
5868 +       int trigger;
5869 +
5870 +       /*
5871 +        * Determine IRQ trigger mode (edge or level sensitive):
5872 +        */
5873 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
5874 +       {
5875 +               case 0: /* conforms, ie. bus-type dependent */
5876 +               {
5877 +                       switch (mp_bus_id_to_type[bus])
5878 +                       {
5879 +                               case MP_BUS_ISA: /* ISA pin */
5880 +                               {
5881 +                                       trigger = default_ISA_trigger(idx);
5882 +                                       break;
5883 +                               }
5884 +                               case MP_BUS_EISA: /* EISA pin */
5885 +                               {
5886 +                                       trigger = default_EISA_trigger(idx);
5887 +                                       break;
5888 +                               }
5889 +                               case MP_BUS_PCI: /* PCI pin */
5890 +                               {
5891 +                                       trigger = default_PCI_trigger(idx);
5892 +                                       break;
5893 +                               }
5894 +                               case MP_BUS_MCA: /* MCA pin */
5895 +                               {
5896 +                                       trigger = default_MCA_trigger(idx);
5897 +                                       break;
5898 +                               }
5899 +                               case MP_BUS_NEC98: /* NEC 98 pin */
5900 +                               {
5901 +                                       trigger = default_NEC98_trigger(idx);
5902 +                                       break;
5903 +                               }
5904 +                               default:
5905 +                               {
5906 +                                       printk(KERN_WARNING "broken BIOS!!\n");
5907 +                                       trigger = 1;
5908 +                                       break;
5909 +                               }
5910 +                       }
5911 +                       break;
5912 +               }
5913 +               case 1: /* edge */
5914 +               {
5915 +                       trigger = 0;
5916 +                       break;
5917 +               }
5918 +               case 2: /* reserved */
5919 +               {
5920 +                       printk(KERN_WARNING "broken BIOS!!\n");
5921 +                       trigger = 1;
5922 +                       break;
5923 +               }
5924 +               case 3: /* level */
5925 +               {
5926 +                       trigger = 1;
5927 +                       break;
5928 +               }
5929 +               default: /* invalid */
5930 +               {
5931 +                       printk(KERN_WARNING "broken BIOS!!\n");
5932 +                       trigger = 0;
5933 +                       break;
5934 +               }
5935 +       }
5936 +       return trigger;
5937 +}
5938 +
5939 +static inline int irq_polarity(int idx)
5940 +{
5941 +       return MPBIOS_polarity(idx);
5942 +}
5943 +
5944 +static inline int irq_trigger(int idx)
5945 +{
5946 +       return MPBIOS_trigger(idx);
5947 +}
5948 +
5949 +static int pin_2_irq(int idx, int apic, int pin)
5950 +{
5951 +       int irq, i;
5952 +       int bus = mp_irqs[idx].mpc_srcbus;
5953 +
5954 +       /*
5955 +        * Debugging check, we are in big trouble if this message pops up!
5956 +        */
5957 +       if (mp_irqs[idx].mpc_dstirq != pin)
5958 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
5959 +
5960 +       switch (mp_bus_id_to_type[bus])
5961 +       {
5962 +               case MP_BUS_ISA: /* ISA pin */
5963 +               case MP_BUS_EISA:
5964 +               case MP_BUS_MCA:
5965 +               case MP_BUS_NEC98:
5966 +               {
5967 +                       irq = mp_irqs[idx].mpc_srcbusirq;
5968 +                       break;
5969 +               }
5970 +               case MP_BUS_PCI: /* PCI pin */
5971 +               {
5972 +                       /*
5973 +                        * PCI IRQs are mapped in order
5974 +                        */
5975 +                       i = irq = 0;
5976 +                       while (i < apic)
5977 +                               irq += nr_ioapic_registers[i++];
5978 +                       irq += pin;
5979 +
5980 +                       /*
5981 +                        * For MPS mode, so far only needed by ES7000 platform
5982 +                        */
5983 +                       if (ioapic_renumber_irq)
5984 +                               irq = ioapic_renumber_irq(apic, irq);
5985 +
5986 +                       break;
5987 +               }
5988 +               default:
5989 +               {
5990 +                       printk(KERN_ERR "unknown bus type %d.\n",bus); 
5991 +                       irq = 0;
5992 +                       break;
5993 +               }
5994 +       }
5995 +
5996 +       /*
5997 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
5998 +        */
5999 +       if ((pin >= 16) && (pin <= 23)) {
6000 +               if (pirq_entries[pin-16] != -1) {
6001 +                       if (!pirq_entries[pin-16]) {
6002 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
6003 +                                               "disabling PIRQ%d\n", pin-16);
6004 +                       } else {
6005 +                               irq = pirq_entries[pin-16];
6006 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
6007 +                                               "using PIRQ%d -> IRQ %d\n",
6008 +                                               pin-16, irq);
6009 +                       }
6010 +               }
6011 +       }
6012 +       return irq;
6013 +}
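
For the PCI case above, the IRQ number is just the pin position counted sequentially across all IO-APICs, before any ES7000 renumbering or pirq= override is applied. Below is a minimal sketch of that arithmetic alone, assuming the nr_ioapic_registers[] array declared earlier in this file; pci_pin_to_irq() is a hypothetical helper, not part of the patch.

static int pci_pin_to_irq(int apic, int pin)
{
        int i = 0, irq = 0;

        while (i < apic)
                irq += nr_ioapic_registers[i++];        /* skip all pins on earlier IO-APICs */
        return irq + pin;       /* e.g. apic 1, pin 5 behind a 24-pin IO-APIC 0 -> IRQ 29 */
}
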
6014 +
6015 +static inline int IO_APIC_irq_trigger(int irq)
6016 +{
6017 +       int apic, idx, pin;
6018 +
6019 +       for (apic = 0; apic < nr_ioapics; apic++) {
6020 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6021 +                       idx = find_irq_entry(apic,pin,mp_INT);
6022 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6023 +                               return irq_trigger(idx);
6024 +               }
6025 +       }
6026 +       /*
6027 +        * nonexistent IRQs are edge default
6028 +        */
6029 +       return 0;
6030 +}
6031 +
6032 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
6033 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
6034 +
6035 +static int __assign_irq_vector(int irq)
6036 +{
6037 +       struct physdev_irq irq_op;
6038 +       int vector;
6039 +
6040 +       BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
6041 +
6042 +       if (irq_vector[irq] > 0) {
6043 +               return irq_vector[irq];
6044 +       }
6045 +       irq_op.irq = irq;
6046 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
6047 +               return -ENOSPC;
6048 +
6049 +       vector = irq_op.vector;
6050 +       irq_vector[irq] = vector;
6051 +
6052 +       return vector;
6053 +}
6054 +
6055 +static int assign_irq_vector(int irq)
6056 +{
6057 +       unsigned long flags;
6058 +       int vector;
6059 +
6060 +       spin_lock_irqsave(&vector_lock, flags);
6061 +       vector = __assign_irq_vector(irq);
6062 +       spin_unlock_irqrestore(&vector_lock, flags);
6063 +
6064 +       return vector;
6065 +}
6066 +#ifndef CONFIG_XEN
6067 +static struct irq_chip ioapic_chip;
6068 +
6069 +#define IOAPIC_AUTO    -1
6070 +#define IOAPIC_EDGE    0
6071 +#define IOAPIC_LEVEL   1
6072 +
6073 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
6074 +{
6075 +       if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
6076 +                       trigger == IOAPIC_LEVEL)
6077 +               set_irq_chip_and_handler_name(irq, &ioapic_chip,
6078 +                                        handle_fasteoi_irq, "fasteoi");
6079 +       else {
6080 +               irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
6081 +               set_irq_chip_and_handler_name(irq, &ioapic_chip,
6082 +                                        handle_edge_irq, "edge");
6083 +       }
6084 +       set_intr_gate(vector, interrupt[irq]);
6085 +}
6086 +#else
6087 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
6088 +#endif
6089 +
6090 +static void __init setup_IO_APIC_irqs(void)
6091 +{
6092 +       struct IO_APIC_route_entry entry;
6093 +       int apic, pin, idx, irq, first_notcon = 1, vector;
6094 +       unsigned long flags;
6095 +
6096 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
6097 +
6098 +       for (apic = 0; apic < nr_ioapics; apic++) {
6099 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6100 +
6101 +               /*
6102 +                * add it to the IO-APIC irq-routing table:
6103 +                */
6104 +               memset(&entry,0,sizeof(entry));
6105 +
6106 +               entry.delivery_mode = INT_DELIVERY_MODE;
6107 +               entry.dest_mode = INT_DEST_MODE;
6108 +               entry.mask = 0;                         /* enable IRQ */
6109 +               entry.dest.logical.logical_dest = 
6110 +                                       cpu_mask_to_apicid(TARGET_CPUS);
6111 +
6112 +               idx = find_irq_entry(apic,pin,mp_INT);
6113 +               if (idx == -1) {
6114 +                       if (first_notcon) {
6115 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
6116 +                                               " IO-APIC (apicid-pin) %d-%d",
6117 +                                               mp_ioapics[apic].mpc_apicid,
6118 +                                               pin);
6119 +                               first_notcon = 0;
6120 +                       } else
6121 +                               apic_printk(APIC_VERBOSE, ", %d-%d",
6122 +                                       mp_ioapics[apic].mpc_apicid, pin);
6123 +                       continue;
6124 +               }
6125 +
6126 +               entry.trigger = irq_trigger(idx);
6127 +               entry.polarity = irq_polarity(idx);
6128 +
6129 +               if (irq_trigger(idx)) {
6130 +                       entry.trigger = 1;
6131 +                       entry.mask = 1;
6132 +               }
6133 +
6134 +               irq = pin_2_irq(idx, apic, pin);
6135 +               /*
6136 +                * skip adding the timer int on secondary nodes, which causes
6137 +                * a small but painful rift in the time-space continuum
6138 +                */
6139 +               if (multi_timer_check(apic, irq))
6140 +                       continue;
6141 +               else
6142 +                       add_pin_to_irq(irq, apic, pin);
6143 +
6144 +               if (/*!apic &&*/ !IO_APIC_IRQ(irq))
6145 +                       continue;
6146 +
6147 +               if (IO_APIC_IRQ(irq)) {
6148 +                       vector = assign_irq_vector(irq);
6149 +                       entry.vector = vector;
6150 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6151 +               
6152 +                       if (!apic && (irq < 16))
6153 +                               disable_8259A_irq(irq);
6154 +               }
6155 +               ioapic_write_entry(apic, pin, entry);
6156 +               spin_lock_irqsave(&ioapic_lock, flags);
6157 +               set_native_irq_info(irq, TARGET_CPUS);
6158 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6159 +       }
6160 +       }
6161 +
6162 +       if (!first_notcon)
6163 +               apic_printk(APIC_VERBOSE, " not connected.\n");
6164 +}
6165 +
6166 +/*
6167 + * Set up the 8259A-master output pin:
6168 + */
6169 +#ifndef CONFIG_XEN
6170 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6171 +{
6172 +       struct IO_APIC_route_entry entry;
6173 +
6174 +       memset(&entry,0,sizeof(entry));
6175 +
6176 +       disable_8259A_irq(0);
6177 +
6178 +       /* mask LVT0 */
6179 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6180 +
6181 +       /*
6182 +        * We use logical delivery to get the timer IRQ
6183 +        * to the first CPU.
6184 +        */
6185 +       entry.dest_mode = INT_DEST_MODE;
6186 +       entry.mask = 0;                                 /* unmask IRQ now */
6187 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6188 +       entry.delivery_mode = INT_DELIVERY_MODE;
6189 +       entry.polarity = 0;
6190 +       entry.trigger = 0;
6191 +       entry.vector = vector;
6192 +
6193 +       /*
6194 +        * The timer IRQ doesn't have to know that behind the
6195 +        * scenes we have an 8259A-master in AEOI mode ...
6196 +        */
6197 +       irq_desc[0].chip = &ioapic_chip;
6198 +       set_irq_handler(0, handle_edge_irq);
6199 +
6200 +       /*
6201 +        * Add it to the IO-APIC irq-routing table:
6202 +        */
6203 +       ioapic_write_entry(apic, pin, entry);
6204 +
6205 +       enable_8259A_irq(0);
6206 +}
6207 +
6208 +static inline void UNEXPECTED_IO_APIC(void)
6209 +{
6210 +}
6211 +
6212 +void __init print_IO_APIC(void)
6213 +{
6214 +       int apic, i;
6215 +       union IO_APIC_reg_00 reg_00;
6216 +       union IO_APIC_reg_01 reg_01;
6217 +       union IO_APIC_reg_02 reg_02;
6218 +       union IO_APIC_reg_03 reg_03;
6219 +       unsigned long flags;
6220 +
6221 +       if (apic_verbosity == APIC_QUIET)
6222 +               return;
6223 +
6224 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6225 +       for (i = 0; i < nr_ioapics; i++)
6226 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6227 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6228 +
6229 +       /*
6230 +        * We are a bit conservative about what we expect.  We have to
6231 +        * know about every hardware change ASAP.
6232 +        */
6233 +       printk(KERN_INFO "testing the IO APIC.......................\n");
6234 +
6235 +       for (apic = 0; apic < nr_ioapics; apic++) {
6236 +
6237 +       spin_lock_irqsave(&ioapic_lock, flags);
6238 +       reg_00.raw = io_apic_read(apic, 0);
6239 +       reg_01.raw = io_apic_read(apic, 1);
6240 +       if (reg_01.bits.version >= 0x10)
6241 +               reg_02.raw = io_apic_read(apic, 2);
6242 +       if (reg_01.bits.version >= 0x20)
6243 +               reg_03.raw = io_apic_read(apic, 3);
6244 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6245 +
6246 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6247 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6248 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
6249 +       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
6250 +       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
6251 +       if (reg_00.bits.ID >= get_physical_broadcast())
6252 +               UNEXPECTED_IO_APIC();
6253 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
6254 +               UNEXPECTED_IO_APIC();
6255 +
6256 +       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
6257 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
6258 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
6259 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
6260 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
6261 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
6262 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
6263 +               (reg_01.bits.entries != 0x2E) &&
6264 +               (reg_01.bits.entries != 0x3F)
6265 +       )
6266 +               UNEXPECTED_IO_APIC();
6267 +
6268 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
6269 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
6270 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
6271 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
6272 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
6273 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
6274 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
6275 +       )
6276 +               UNEXPECTED_IO_APIC();
6277 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
6278 +               UNEXPECTED_IO_APIC();
6279 +
6280 +       /*
6281 +        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
6282 +        * but the value of reg_02 is read as the previous read register
6283 +        * value, so ignore it if reg_02 == reg_01.
6284 +        */
6285 +       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
6286 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
6287 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
6288 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
6289 +                       UNEXPECTED_IO_APIC();
6290 +       }
6291 +
6292 +       /*
6293 +        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
6294 +        * or reg_03, but the value of reg_0[23] is read as the previous read
6295 +        * register value, so ignore it if reg_03 == reg_0[12].
6296 +        */
6297 +       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
6298 +           reg_03.raw != reg_01.raw) {
6299 +               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
6300 +               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
6301 +               if (reg_03.bits.__reserved_1)
6302 +                       UNEXPECTED_IO_APIC();
6303 +       }
6304 +
6305 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
6306 +
6307 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
6308 +                         " Stat Dest Deli Vect:   \n");
6309 +
6310 +       for (i = 0; i <= reg_01.bits.entries; i++) {
6311 +               struct IO_APIC_route_entry entry;
6312 +
6313 +               entry = ioapic_read_entry(apic, i);
6314 +
6315 +               printk(KERN_DEBUG " %02x %03X %02X  ",
6316 +                       i,
6317 +                       entry.dest.logical.logical_dest,
6318 +                       entry.dest.physical.physical_dest
6319 +               );
6320 +
6321 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
6322 +                       entry.mask,
6323 +                       entry.trigger,
6324 +                       entry.irr,
6325 +                       entry.polarity,
6326 +                       entry.delivery_status,
6327 +                       entry.dest_mode,
6328 +                       entry.delivery_mode,
6329 +                       entry.vector
6330 +               );
6331 +       }
6332 +       }
6333 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
6334 +       for (i = 0; i < NR_IRQS; i++) {
6335 +               struct irq_pin_list *entry = irq_2_pin + i;
6336 +               if (entry->pin < 0)
6337 +                       continue;
6338 +               printk(KERN_DEBUG "IRQ%d ", i);
6339 +               for (;;) {
6340 +                       printk("-> %d:%d", entry->apic, entry->pin);
6341 +                       if (!entry->next)
6342 +                               break;
6343 +                       entry = irq_2_pin + entry->next;
6344 +               }
6345 +               printk("\n");
6346 +       }
6347 +
6348 +       printk(KERN_INFO ".................................... done.\n");
6349 +
6350 +       return;
6351 +}
6352 +
6353 +#if 0
6354 +
6355 +static void print_APIC_bitfield (int base)
6356 +{
6357 +       unsigned int v;
6358 +       int i, j;
6359 +
6360 +       if (apic_verbosity == APIC_QUIET)
6361 +               return;
6362 +
6363 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
6364 +       for (i = 0; i < 8; i++) {
6365 +               v = apic_read(base + i*0x10);
6366 +               for (j = 0; j < 32; j++) {
6367 +                       if (v & (1<<j))
6368 +                               printk("1");
6369 +                       else
6370 +                               printk("0");
6371 +               }
6372 +               printk("\n");
6373 +       }
6374 +}
6375 +
6376 +void /*__init*/ print_local_APIC(void * dummy)
6377 +{
6378 +       unsigned int v, ver, maxlvt;
6379 +
6380 +       if (apic_verbosity == APIC_QUIET)
6381 +               return;
6382 +
6383 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6384 +               smp_processor_id(), hard_smp_processor_id());
6385 +       v = apic_read(APIC_ID);
6386 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
6387 +       v = apic_read(APIC_LVR);
6388 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
6389 +       ver = GET_APIC_VERSION(v);
6390 +       maxlvt = get_maxlvt();
6391 +
6392 +       v = apic_read(APIC_TASKPRI);
6393 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
6394 +
6395 +       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
6396 +               v = apic_read(APIC_ARBPRI);
6397 +               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
6398 +                       v & APIC_ARBPRI_MASK);
6399 +               v = apic_read(APIC_PROCPRI);
6400 +               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
6401 +       }
6402 +
6403 +       v = apic_read(APIC_EOI);
6404 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
6405 +       v = apic_read(APIC_RRR);
6406 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
6407 +       v = apic_read(APIC_LDR);
6408 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
6409 +       v = apic_read(APIC_DFR);
6410 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
6411 +       v = apic_read(APIC_SPIV);
6412 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
6413 +
6414 +       printk(KERN_DEBUG "... APIC ISR field:\n");
6415 +       print_APIC_bitfield(APIC_ISR);
6416 +       printk(KERN_DEBUG "... APIC TMR field:\n");
6417 +       print_APIC_bitfield(APIC_TMR);
6418 +       printk(KERN_DEBUG "... APIC IRR field:\n");
6419 +       print_APIC_bitfield(APIC_IRR);
6420 +
6421 +       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
6422 +               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
6423 +                       apic_write(APIC_ESR, 0);
6424 +               v = apic_read(APIC_ESR);
6425 +               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
6426 +       }
6427 +
6428 +       v = apic_read(APIC_ICR);
6429 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
6430 +       v = apic_read(APIC_ICR2);
6431 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
6432 +
6433 +       v = apic_read(APIC_LVTT);
6434 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
6435 +
6436 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
6437 +               v = apic_read(APIC_LVTPC);
6438 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
6439 +       }
6440 +       v = apic_read(APIC_LVT0);
6441 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
6442 +       v = apic_read(APIC_LVT1);
6443 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
6444 +
6445 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
6446 +               v = apic_read(APIC_LVTERR);
6447 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
6448 +       }
6449 +
6450 +       v = apic_read(APIC_TMICT);
6451 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
6452 +       v = apic_read(APIC_TMCCT);
6453 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
6454 +       v = apic_read(APIC_TDCR);
6455 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
6456 +       printk("\n");
6457 +}
6458 +
6459 +void print_all_local_APICs (void)
6460 +{
6461 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
6462 +}
6463 +
6464 +void /*__init*/ print_PIC(void)
6465 +{
6466 +       unsigned int v;
6467 +       unsigned long flags;
6468 +
6469 +       if (apic_verbosity == APIC_QUIET)
6470 +               return;
6471 +
6472 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
6473 +
6474 +       spin_lock_irqsave(&i8259A_lock, flags);
6475 +
6476 +       v = inb(0xa1) << 8 | inb(0x21);
6477 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
6478 +
6479 +       v = inb(0xa0) << 8 | inb(0x20);
6480 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
6481 +
6482 +       outb(0x0b,0xa0);
6483 +       outb(0x0b,0x20);
6484 +       v = inb(0xa0) << 8 | inb(0x20);
6485 +       outb(0x0a,0xa0);
6486 +       outb(0x0a,0x20);
6487 +
6488 +       spin_unlock_irqrestore(&i8259A_lock, flags);
6489 +
6490 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
6491 +
6492 +       v = inb(0x4d1) << 8 | inb(0x4d0);
6493 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
6494 +}
6495 +
6496 +#endif  /*  0  */
6497 +
6498 +#else
6499 +void __init print_IO_APIC(void) { }
6500 +#endif /* !CONFIG_XEN */
6501 +
6502 +static void __init enable_IO_APIC(void)
6503 +{
6504 +       union IO_APIC_reg_01 reg_01;
6505 +       int i8259_apic, i8259_pin;
6506 +       int i, apic;
6507 +       unsigned long flags;
6508 +
6509 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
6510 +               irq_2_pin[i].pin = -1;
6511 +               irq_2_pin[i].next = 0;
6512 +       }
6513 +       if (!pirqs_enabled)
6514 +               for (i = 0; i < MAX_PIRQS; i++)
6515 +                       pirq_entries[i] = -1;
6516 +
6517 +       /*
6518 +        * The number of IO-APIC IRQ registers (== #pins):
6519 +        */
6520 +       for (apic = 0; apic < nr_ioapics; apic++) {
6521 +               spin_lock_irqsave(&ioapic_lock, flags);
6522 +               reg_01.raw = io_apic_read(apic, 1);
6523 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6524 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
6525 +       }
6526 +       for(apic = 0; apic < nr_ioapics; apic++) {
6527 +               int pin;
6528 +               /* See if any of the pins is in ExtINT mode */
6529 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6530 +                       struct IO_APIC_route_entry entry;
6531 +                       entry = ioapic_read_entry(apic, pin);
6532 +
6533 +
6534 +                       /* If the interrupt line is enabled and in ExtInt mode
6535 +                        * I have found the pin where the i8259 is connected.
6536 +                        */
6537 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
6538 +                               ioapic_i8259.apic = apic;
6539 +                               ioapic_i8259.pin  = pin;
6540 +                               goto found_i8259;
6541 +                       }
6542 +               }
6543 +       }
6544 + found_i8259:
6545 +       /* Look to see what if the MP table has reported the ExtINT */
6546 +       /* Look to see if the MP table has reported the ExtINT */
6547 +       /* If we could not find the appropriate pin by looking at the ioapic,
6548 +        * the i8259 probably is not connected to the ioapic, but give the
6549 +        * mptable a chance anyway.
6550 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
6551 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
6552 +       /* Trust the MP table if nothing is set up in the hardware */
6553 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
6554 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
6555 +               ioapic_i8259.pin  = i8259_pin;
6556 +               ioapic_i8259.apic = i8259_apic;
6557 +       }
6558 +       /* Complain if the MP table and the hardware disagree */
6559 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
6560 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
6561 +       {
6562 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
6563 +       }
6564 +
6565 +       /*
6566 +        * Do not trust the IO-APIC being empty at bootup
6567 +        */
6568 +       clear_IO_APIC();
6569 +}
6570 +
6571 +/*
6572 + * Not an __init, needed by the reboot code
6573 + */
6574 +void disable_IO_APIC(void)
6575 +{
6576 +       /*
6577 +        * Clear the IO-APIC before rebooting:
6578 +        */
6579 +       clear_IO_APIC();
6580 +
6581 +#ifndef CONFIG_XEN
6582 +       /*
6583 +        * If the i8259 is routed through an IOAPIC,
6584 +        * put that IOAPIC in virtual wire mode
6585 +        * so legacy interrupts can be delivered.
6586 +        */
6587 +       if (ioapic_i8259.pin != -1) {
6588 +               struct IO_APIC_route_entry entry;
6589 +
6590 +               memset(&entry, 0, sizeof(entry));
6591 +               entry.mask            = 0; /* Enabled */
6592 +               entry.trigger         = 0; /* Edge */
6593 +               entry.irr             = 0;
6594 +               entry.polarity        = 0; /* High */
6595 +               entry.delivery_status = 0;
6596 +               entry.dest_mode       = 0; /* Physical */
6597 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
6598 +               entry.vector          = 0;
6599 +               entry.dest.physical.physical_dest =
6600 +                                       GET_APIC_ID(apic_read(APIC_ID));
6601 +
6602 +               /*
6603 +                * Add it to the IO-APIC irq-routing table:
6604 +                */
6605 +               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
6606 +       }
6607 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
6608 +#endif
6609 +}
6610 +
6611 +/*
6612 + * function to set the IO-APIC physical IDs based on the
6613 + * values stored in the MPC table.
6614 + *
6615 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
6616 + */
6617 +
6618 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
6619 +static void __init setup_ioapic_ids_from_mpc(void)
6620 +{
6621 +       union IO_APIC_reg_00 reg_00;
6622 +       physid_mask_t phys_id_present_map;
6623 +       int apic;
6624 +       int i;
6625 +       unsigned char old_id;
6626 +       unsigned long flags;
6627 +
6628 +       /*
6629 +        * Don't check I/O APIC IDs for xAPIC systems.  They have
6630 +        * no meaning without the serial APIC bus.
6631 +        */
6632 +       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
6633 +               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
6634 +               return;
6635 +       /*
6636 +        * This is broken; anything with a real cpu count has to
6637 +        * circumvent this idiocy regardless.
6638 +        */
6639 +       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
6640 +
6641 +       /*
6642 +        * Set the IOAPIC ID to the value stored in the MPC table.
6643 +        */
6644 +       for (apic = 0; apic < nr_ioapics; apic++) {
6645 +
6646 +               /* Read the register 0 value */
6647 +               spin_lock_irqsave(&ioapic_lock, flags);
6648 +               reg_00.raw = io_apic_read(apic, 0);
6649 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6650 +               
6651 +               old_id = mp_ioapics[apic].mpc_apicid;
6652 +
6653 +               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
6654 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
6655 +                               apic, mp_ioapics[apic].mpc_apicid);
6656 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
6657 +                               reg_00.bits.ID);
6658 +                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
6659 +               }
6660 +
6661 +               /*
6662 +                * Sanity check, is the ID really free? Every APIC in a
6663 +                * system must have a unique ID or we get lots of nice
6664 +                * 'stuck on smp_invalidate_needed IPI wait' messages.
6665 +                */
6666 +               if (check_apicid_used(phys_id_present_map,
6667 +                                       mp_ioapics[apic].mpc_apicid)) {
6668 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
6669 +                               apic, mp_ioapics[apic].mpc_apicid);
6670 +                       for (i = 0; i < get_physical_broadcast(); i++)
6671 +                               if (!physid_isset(i, phys_id_present_map))
6672 +                                       break;
6673 +                       if (i >= get_physical_broadcast())
6674 +                               panic("Max APIC ID exceeded!\n");
6675 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
6676 +                               i);
6677 +                       physid_set(i, phys_id_present_map);
6678 +                       mp_ioapics[apic].mpc_apicid = i;
6679 +               } else {
6680 +                       physid_mask_t tmp;
6681 +                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
6682 +                       apic_printk(APIC_VERBOSE, "Setting %d in the "
6683 +                                       "phys_id_present_map\n",
6684 +                                       mp_ioapics[apic].mpc_apicid);
6685 +                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
6686 +               }
6687 +
6688 +
6689 +               /*
6690 +                * We need to adjust the IRQ routing table
6691 +                * if the ID changed.
6692 +                */
6693 +               if (old_id != mp_ioapics[apic].mpc_apicid)
6694 +                       for (i = 0; i < mp_irq_entries; i++)
6695 +                               if (mp_irqs[i].mpc_dstapic == old_id)
6696 +                                       mp_irqs[i].mpc_dstapic
6697 +                                               = mp_ioapics[apic].mpc_apicid;
6698 +
6699 +               /*
6700 +                * Read the right value from the MPC table and
6701 +                * write it into the ID register.
6702 +                */
6703 +               apic_printk(APIC_VERBOSE, KERN_INFO
6704 +                       "...changing IO-APIC physical APIC ID to %d ...",
6705 +                       mp_ioapics[apic].mpc_apicid);
6706 +
6707 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
6708 +               spin_lock_irqsave(&ioapic_lock, flags);
6709 +               io_apic_write(apic, 0, reg_00.raw);
6710 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6711 +
6712 +               /*
6713 +                * Sanity check
6714 +                */
6715 +               spin_lock_irqsave(&ioapic_lock, flags);
6716 +               reg_00.raw = io_apic_read(apic, 0);
6717 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6718 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
6719 +                       printk("could not set ID!\n");
6720 +               else
6721 +                       apic_printk(APIC_VERBOSE, " ok.\n");
6722 +       }
6723 +}
6724 +#else
6725 +static void __init setup_ioapic_ids_from_mpc(void) { }
6726 +#endif
6727 +
6728 +#ifndef CONFIG_XEN
6729 +/*
6730 + * There is a nasty bug in some older SMP boards: their mptable lies
6731 + * about the timer IRQ. We do the following to work around the situation:
6732 + *
6733 + *     - timer IRQ defaults to IO-APIC IRQ
6734 + *     - if this function detects that timer IRQs are defunct, then we fall
6735 + *       back to ISA timer IRQs
6736 + */
6737 +static int __init timer_irq_works(void)
6738 +{
6739 +       unsigned long t1 = jiffies;
6740 +
6741 +       local_irq_enable();
6742 +       /* Let ten ticks pass... */
6743 +       mdelay((10 * 1000) / HZ);
6744 +
6745 +       /*
6746 +        * Expect a few ticks at least, to be sure some possible
6747 +        * glue logic does not lock up after the first one or two
6748 +        * ticks in a non-ExtINT mode.  Also the local APIC
6749 +        * might have cached one ExtINT interrupt.  Finally, at
6750 +        * least one tick may be lost due to delays.
6751 +        */
6752 +       if (jiffies - t1 > 4)
6753 +               return 1;
6754 +
6755 +       return 0;
6756 +}
6757 +
6758 +/*
6759 + * In the SMP+IOAPIC case it might happen that there are an unspecified
6760 + * number of pending IRQ events unhandled. These cases are very rare,
6761 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
6762 + * better to do it this way as thus we do not have to be aware of
6763 + * 'pending' interrupts in the IRQ path, except at this point.
6764 + */
6765 +/*
6766 + * Edge triggered needs to resend any interrupt
6767 + * that was delayed but this is now handled in the device
6768 + * independent code.
6769 + */
6770 +
6771 +/*
6772 + * Startup quirk:
6773 + *
6774 + * Starting up an edge-triggered IO-APIC interrupt is
6775 + * nasty - we need to make sure that we get the edge.
6776 + * If it is already asserted for some reason, we need
6777 + * to return 1 to indicate that it was pending.
6778 + *
6779 + * This is not complete - we should be able to fake
6780 + * an edge even if it isn't on the 8259A...
6781 + *
6782 + * (We do this for level-triggered IRQs too - it cannot hurt.)
6783 + */
6784 +static unsigned int startup_ioapic_irq(unsigned int irq)
6785 +{
6786 +       int was_pending = 0;
6787 +       unsigned long flags;
6788 +
6789 +       spin_lock_irqsave(&ioapic_lock, flags);
6790 +       if (irq < 16) {
6791 +               disable_8259A_irq(irq);
6792 +               if (i8259A_irq_pending(irq))
6793 +                       was_pending = 1;
6794 +       }
6795 +       __unmask_IO_APIC_irq(irq);
6796 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6797 +
6798 +       return was_pending;
6799 +}
6800 +
6801 +static void ack_ioapic_irq(unsigned int irq)
6802 +{
6803 +       move_native_irq(irq);
6804 +       ack_APIC_irq();
6805 +}
6806 +
6807 +static void ack_ioapic_quirk_irq(unsigned int irq)
6808 +{
6809 +       unsigned long v;
6810 +       int i;
6811 +
6812 +       move_native_irq(irq);
6813 +/*
6814 + * It appears there is an erratum which affects at least version 0x11
6815 + * of I/O APIC (that's the 82093AA and cores integrated into various
6816 + * chipsets).  Under certain conditions a level-triggered interrupt is
6817 + * erroneously delivered as edge-triggered one but the respective IRR
6818 + * bit gets set nevertheless.  As a result the I/O unit expects an EOI
6819 + * message but it will never arrive and further interrupts are blocked
6820 + * from the source.  The exact reason is so far unknown, but the
6821 + * phenomenon was observed when two consecutive interrupt requests
6822 + * from a given source get delivered to the same CPU and the source is
6823 + * temporarily disabled in between.
6824 + *
6825 + * A workaround is to simulate an EOI message manually.  We achieve it
6826 + * by setting the trigger mode to edge and then to level when the edge
6827 + * trigger mode gets detected in the TMR of a local APIC for a
6828 + * level-triggered interrupt.  We mask the source for the time of the
6829 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
6830 + * The idea is from Manfred Spraul.  --macro
6831 + */
6832 +       i = irq_vector[irq];
6833 +
6834 +       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
6835 +
6836 +       ack_APIC_irq();
6837 +
6838 +       if (!(v & (1 << (i & 0x1f)))) {
6839 +               atomic_inc(&irq_mis_count);
6840 +               spin_lock(&ioapic_lock);
6841 +               __mask_and_edge_IO_APIC_irq(irq);
6842 +               __unmask_and_level_IO_APIC_irq(irq);
6843 +               spin_unlock(&ioapic_lock);
6844 +       }
6845 +}
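For reference, a small sketch (editorial, not part of the patch) of the TMR addressing the workaround above relies on: the local APIC's trigger-mode register is a 256-bit bitmap spread over eight 32-bit registers spaced 0x10 apart, so a vector maps to a register offset and bit position like this:

static void tmr_location(unsigned int vector,
                         unsigned int *reg_offset, unsigned int *bit)
{
        *reg_offset = (vector & ~0x1f) >> 1;    /* (vector / 32) * 0x10 */
        *bit        = vector & 0x1f;            /* vector % 32 */
}
/* e.g. vector 0x31 lands in APIC_TMR + 0x10, bit 17 */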
6846 +
6847 +static int ioapic_retrigger_irq(unsigned int irq)
6848 +{
6849 +       send_IPI_self(irq_vector[irq]);
6850 +
6851 +       return 1;
6852 +}
6853 +
6854 +static struct irq_chip ioapic_chip __read_mostly = {
6855 +       .name           = "IO-APIC",
6856 +       .startup        = startup_ioapic_irq,
6857 +       .mask           = mask_IO_APIC_irq,
6858 +       .unmask         = unmask_IO_APIC_irq,
6859 +       .ack            = ack_ioapic_irq,
6860 +       .eoi            = ack_ioapic_quirk_irq,
6861 +#ifdef CONFIG_SMP
6862 +       .set_affinity   = set_ioapic_affinity_irq,
6863 +#endif
6864 +       .retrigger      = ioapic_retrigger_irq,
6865 +};
6866 +
6867 +#endif /* !CONFIG_XEN */
6868 +
6869 +static inline void init_IO_APIC_traps(void)
6870 +{
6871 +       int irq;
6872 +
6873 +       /*
6874 +        * NOTE! The local APIC isn't very good at handling
6875 +        * multiple interrupts at the same interrupt level.
6876 +        * As the interrupt level is determined by taking the
6877 +        * vector number and shifting that right by 4, we
6878 +        * want to spread these out a bit so that they don't
6879 +        * all fall in the same interrupt level.
6880 +        *
6881 +        * Also, we've got to be careful not to trash gate
6882 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
6883 +        */
6884 +       for (irq = 0; irq < NR_IRQS ; irq++) {
6885 +               int tmp = irq;
6886 +               if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
6887 +                       /*
6888 +                        * Hmm.. We don't have an entry for this,
6889 +                        * so default to an old-fashioned 8259
6890 +                        * interrupt if we can..
6891 +                        */
6892 +                       if (irq < 16)
6893 +                               make_8259A_irq(irq);
6894 +#ifndef CONFIG_XEN
6895 +                       else
6896 +                               /* Strange. Oh, well.. */
6897 +                               irq_desc[irq].chip = &no_irq_chip;
6898 +#endif
6899 +               }
6900 +       }
6901 +}
6902 +
6903 +#ifndef CONFIG_XEN
6904 +/*
6905 + * The local APIC irq-chip implementation:
6906 + */
6907 +
6908 +static void ack_apic(unsigned int irq)
6909 +{
6910 +       ack_APIC_irq();
6911 +}
6912 +
6913 +static void mask_lapic_irq (unsigned int irq)
6914 +{
6915 +       unsigned long v;
6916 +
6917 +       v = apic_read(APIC_LVT0);
6918 +       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
6919 +}
6920 +
6921 +static void unmask_lapic_irq (unsigned int irq)
6922 +{
6923 +       unsigned long v;
6924 +
6925 +       v = apic_read(APIC_LVT0);
6926 +       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
6927 +}
6928 +
6929 +static struct irq_chip lapic_chip __read_mostly = {
6930 +       .name           = "local-APIC-edge",
6931 +       .mask           = mask_lapic_irq,
6932 +       .unmask         = unmask_lapic_irq,
6933 +       .eoi            = ack_apic,
6934 +};
6935 +
6936 +static void setup_nmi (void)
6937 +{
6938 +       /*
6939 +        * Dirty trick to enable the NMI watchdog ...
6940 +        * We put the 8259A master into AEOI mode and
6941 +        * unmask LVT0 as NMI on all local APICs.
6942 +        *
6943 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
6944 +        * is from Maciej W. Rozycki - so we do not have to EOI from
6945 +        * the NMI handler or the timer interrupt.
6946 +        */ 
6947 +       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
6948 +
6949 +       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
6950 +
6951 +       apic_printk(APIC_VERBOSE, " done.\n");
6952 +}
6953 +
6954 +/*
6955 + * This looks a bit hackish, but it's about the only way of sending
6956 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
6957 + * not support the ExtINT mode, unfortunately.  We need to send these
6958 + * cycles as some i82489DX-based boards have glue logic that keeps the
6959 + * 8259A interrupt line asserted until INTA.  --macro
6960 + */
6961 +static inline void unlock_ExtINT_logic(void)
6962 +{
6963 +       int apic, pin, i;
6964 +       struct IO_APIC_route_entry entry0, entry1;
6965 +       unsigned char save_control, save_freq_select;
6966 +
6967 +       pin  = find_isa_irq_pin(8, mp_INT);
6968 +       apic = find_isa_irq_apic(8, mp_INT);
6969 +       if (pin == -1)
6970 +               return;
6971 +
6972 +       entry0 = ioapic_read_entry(apic, pin);
6973 +       clear_IO_APIC_pin(apic, pin);
6974 +
6975 +       memset(&entry1, 0, sizeof(entry1));
6976 +
6977 +       entry1.dest_mode = 0;                   /* physical delivery */
6978 +       entry1.mask = 0;                        /* unmask IRQ now */
6979 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
6980 +       entry1.delivery_mode = dest_ExtINT;
6981 +       entry1.polarity = entry0.polarity;
6982 +       entry1.trigger = 0;
6983 +       entry1.vector = 0;
6984 +
6985 +       ioapic_write_entry(apic, pin, entry1);
6986 +
6987 +       save_control = CMOS_READ(RTC_CONTROL);
6988 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
6989 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
6990 +                  RTC_FREQ_SELECT);
6991 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
6992 +
6993 +       i = 100;
6994 +       while (i-- > 0) {
6995 +               mdelay(10);
6996 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
6997 +                       i -= 10;
6998 +       }
6999 +
7000 +       CMOS_WRITE(save_control, RTC_CONTROL);
7001 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
7002 +       clear_IO_APIC_pin(apic, pin);
7003 +
7004 +       ioapic_write_entry(apic, pin, entry0);
7005 +}
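As an aside (editorial, not part of the patch): the rate-select value 0x6 written into RTC_FREQ_SELECT above selects a 1024 Hz periodic interrupt on an MC146818-compatible RTC, which is why the RTC_PF polling loop above completes quickly. A sketch of the relationship, assuming the standard MC146818 rate table:

static unsigned int rtc_periodic_hz(unsigned int rate_select)
{
        /* MC146818: for rate-select 3..15, frequency = 32768 Hz >> (rate - 1);
         * rates 1 and 2 are special-cased by the hardware and not used here */
        if (rate_select < 3 || rate_select > 15)
                return 0;
        return 32768u >> (rate_select - 1);
}
/* rtc_periodic_hz(0x6) == 1024 */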
7006 +#endif /* !CONFIG_XEN */
7007 +
7008 +int timer_uses_ioapic_pin_0;
7009 +
7010 +#ifndef CONFIG_XEN
7011 +/*
7012 + * This code may look a bit paranoid, but it's supposed to cooperate with
7013 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
7014 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
7015 + * fanatically on his truly buggy board.
7016 + */
7017 +static inline void check_timer(void)
7018 +{
7019 +       int apic1, pin1, apic2, pin2;
7020 +       int vector;
7021 +
7022 +       /*
7023 +        * get/set the timer IRQ vector:
7024 +        */
7025 +       disable_8259A_irq(0);
7026 +       vector = assign_irq_vector(0);
7027 +       set_intr_gate(vector, interrupt[0]);
7028 +
7029 +       /*
7030 +        * Subtle: code in do_timer_interrupt() expects an AEOI
7031 +        * mode for the 8259A whenever interrupts are routed
7032 +        * through I/O APICs.  Also IRQ0 has to be enabled in
7033 +        * the 8259A which implies the virtual wire has to be
7034 +        * disabled in the local APIC.
7035 +        */
7036 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7037 +       init_8259A(1);
7038 +       timer_ack = 1;
7039 +       if (timer_over_8254 > 0)
7040 +               enable_8259A_irq(0);
7041 +
7042 +       pin1  = find_isa_irq_pin(0, mp_INT);
7043 +       apic1 = find_isa_irq_apic(0, mp_INT);
7044 +       pin2  = ioapic_i8259.pin;
7045 +       apic2 = ioapic_i8259.apic;
7046 +
7047 +       if (pin1 == 0)
7048 +               timer_uses_ioapic_pin_0 = 1;
7049 +
7050 +       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7051 +               vector, apic1, pin1, apic2, pin2);
7052 +
7053 +       if (pin1 != -1) {
7054 +               /*
7055 +                * Ok, does IRQ0 through the IOAPIC work?
7056 +                */
7057 +               unmask_IO_APIC_irq(0);
7058 +               if (timer_irq_works()) {
7059 +                       if (nmi_watchdog == NMI_IO_APIC) {
7060 +                               disable_8259A_irq(0);
7061 +                               setup_nmi();
7062 +                               enable_8259A_irq(0);
7063 +                       }
7064 +                       if (disable_timer_pin_1 > 0)
7065 +                               clear_IO_APIC_pin(0, pin1);
7066 +                       return;
7067 +               }
7068 +               clear_IO_APIC_pin(apic1, pin1);
7069 +               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7070 +                               "IO-APIC\n");
7071 +       }
7072 +
7073 +       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7074 +       if (pin2 != -1) {
7075 +               printk("\n..... (found pin %d) ...", pin2);
7076 +               /*
7077 +                * legacy devices should be connected to IO APIC #0
7078 +                */
7079 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7080 +               if (timer_irq_works()) {
7081 +                       printk("works.\n");
7082 +                       if (pin1 != -1)
7083 +                               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7084 +                       else
7085 +                               add_pin_to_irq(0, apic2, pin2);
7086 +                       if (nmi_watchdog == NMI_IO_APIC) {
7087 +                               setup_nmi();
7088 +                       }
7089 +                       return;
7090 +               }
7091 +               /*
7092 +                * Cleanup, just in case ...
7093 +                */
7094 +               clear_IO_APIC_pin(apic2, pin2);
7095 +       }
7096 +       printk(" failed.\n");
7097 +
7098 +       if (nmi_watchdog == NMI_IO_APIC) {
7099 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7100 +               nmi_watchdog = 0;
7101 +       }
7102 +
7103 +       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7104 +
7105 +       disable_8259A_irq(0);
7106 +       set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7107 +                                     "fasteoi");
7108 +       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
7109 +       enable_8259A_irq(0);
7110 +
7111 +       if (timer_irq_works()) {
7112 +               printk(" works.\n");
7113 +               return;
7114 +       }
7115 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7116 +       printk(" failed.\n");
7117 +
7118 +       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7119 +
7120 +       timer_ack = 0;
7121 +       init_8259A(0);
7122 +       make_8259A_irq(0);
7123 +       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7124 +
7125 +       unlock_ExtINT_logic();
7126 +
7127 +       if (timer_irq_works()) {
7128 +               printk(" works.\n");
7129 +               return;
7130 +       }
7131 +       printk(" failed :(.\n");
7132 +       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
7133 +               "report.  Then try booting with the 'noapic' option");
7134 +}
7135 +#else
7136 +#define check_timer() ((void)0)
7137 +#endif /* CONFIG_XEN */
7138 +
7139 +/*
7140 + *
7141 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
7142 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
7143 + *   Linux doesn't really care, as it's not actually used
7144 + *   for any interrupt handling anyway.
7145 + */
7146 +#define PIC_IRQS       (1 << PIC_CASCADE_IR)
7147 +
7148 +void __init setup_IO_APIC(void)
7149 +{
7150 +       enable_IO_APIC();
7151 +
7152 +       if (acpi_ioapic)
7153 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
7154 +       else
7155 +               io_apic_irqs = ~PIC_IRQS;
7156 +
7157 +       printk("ENABLING IO-APIC IRQs\n");
7158 +
7159 +       /*
7160 +        * Set up IO-APIC IRQ routing.
7161 +        */
7162 +       if (!acpi_ioapic)
7163 +               setup_ioapic_ids_from_mpc();
7164 +#ifndef CONFIG_XEN
7165 +       sync_Arb_IDs();
7166 +#endif
7167 +       setup_IO_APIC_irqs();
7168 +       init_IO_APIC_traps();
7169 +       check_timer();
7170 +       if (!acpi_ioapic)
7171 +               print_IO_APIC();
7172 +}
7173 +
7174 +static int __init setup_disable_8254_timer(char *s)
7175 +{
7176 +       timer_over_8254 = -1;
7177 +       return 1;
7178 +}
7179 +static int __init setup_enable_8254_timer(char *s)
7180 +{
7181 +       timer_over_8254 = 2;
7182 +       return 1;
7183 +}
7184 +
7185 +__setup("disable_8254_timer", setup_disable_8254_timer);
7186 +__setup("enable_8254_timer", setup_enable_8254_timer);
7187 +
7188 +/*
7189 + *     Called after all the initialization is done. If we didn't find any
7190 + *     APIC bugs then we can allow the modify fast path
7191 + */
7192 +
7193 +static int __init io_apic_bug_finalize(void)
7194 +{
7195 +       if(sis_apic_bug == -1)
7196 +               sis_apic_bug = 0;
7197 +       if (is_initial_xendomain()) {
7198 +               dom0_op_t op = { .cmd = DOM0_PLATFORM_QUIRK };
7199 +               op.u.platform_quirk.quirk_id = sis_apic_bug ?
7200 +                       QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
7201 +               HYPERVISOR_dom0_op(&op);
7202 +       }
7203 +       return 0;
7204 +}
7205 +
7206 +late_initcall(io_apic_bug_finalize);
7207 +
7208 +struct sysfs_ioapic_data {
7209 +       struct sys_device dev;
7210 +       struct IO_APIC_route_entry entry[0];
7211 +};
7212 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7213 +
7214 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7215 +{
7216 +       struct IO_APIC_route_entry *entry;
7217 +       struct sysfs_ioapic_data *data;
7218 +       int i;
7219 +       
7220 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
7221 +       entry = data->entry;
7222 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7223 +               entry[i] = ioapic_read_entry(dev->id, i);
7224 +
7225 +       return 0;
7226 +}
7227 +
7228 +static int ioapic_resume(struct sys_device *dev)
7229 +{
7230 +       struct IO_APIC_route_entry *entry;
7231 +       struct sysfs_ioapic_data *data;
7232 +       unsigned long flags;
7233 +       union IO_APIC_reg_00 reg_00;
7234 +       int i;
7235 +       
7236 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
7237 +       entry = data->entry;
7238 +
7239 +       spin_lock_irqsave(&ioapic_lock, flags);
7240 +       reg_00.raw = io_apic_read(dev->id, 0);
7241 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7242 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7243 +               io_apic_write(dev->id, 0, reg_00.raw);
7244 +       }
7245 +       spin_unlock_irqrestore(&ioapic_lock, flags);
7246 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7247 +               ioapic_write_entry(dev->id, i, entry[i]);
7248 +
7249 +       return 0;
7250 +}
7251 +
7252 +static struct sysdev_class ioapic_sysdev_class = {
7253 +       set_kset_name("ioapic"),
7254 +       .suspend = ioapic_suspend,
7255 +       .resume = ioapic_resume,
7256 +};
7257 +
7258 +static int __init ioapic_init_sysfs(void)
7259 +{
7260 +       struct sys_device * dev;
7261 +       int i, size, error = 0;
7262 +
7263 +       error = sysdev_class_register(&ioapic_sysdev_class);
7264 +       if (error)
7265 +               return error;
7266 +
7267 +       for (i = 0; i < nr_ioapics; i++ ) {
7268 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
7269 +                       * sizeof(struct IO_APIC_route_entry);
7270 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7271 +               if (!mp_ioapic_data[i]) {
7272 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7273 +                       continue;
7274 +               }
7275 +               memset(mp_ioapic_data[i], 0, size);
7276 +               dev = &mp_ioapic_data[i]->dev;
7277 +               dev->id = i; 
7278 +               dev->cls = &ioapic_sysdev_class;
7279 +               error = sysdev_register(dev);
7280 +               if (error) {
7281 +                       kfree(mp_ioapic_data[i]);
7282 +                       mp_ioapic_data[i] = NULL;
7283 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7284 +                       continue;
7285 +               }
7286 +       }
7287 +
7288 +       return 0;
7289 +}
7290 +
7291 +device_initcall(ioapic_init_sysfs);
7292 +
7293 +/*
7294 + * Dynamic irq allocation and deallocation
7295 + */
7296 +int create_irq(void)
7297 +{
7298 +       /* Allocate an unused irq */
7299 +       int irq, new, vector;
7300 +       unsigned long flags;
7301 +
7302 +       irq = -ENOSPC;
7303 +       spin_lock_irqsave(&vector_lock, flags);
7304 +       for (new = (NR_IRQS - 1); new >= 0; new--) {
7305 +               if (platform_legacy_irq(new))
7306 +                       continue;
7307 +               if (irq_vector[new] != 0)
7308 +                       continue;
7309 +               vector = __assign_irq_vector(new);
7310 +               if (likely(vector > 0))
7311 +                       irq = new;
7312 +               break;
7313 +       }
7314 +       spin_unlock_irqrestore(&vector_lock, flags);
7315 +
7316 +       if (irq >= 0) {
7317 +#ifndef CONFIG_XEN
7318 +               set_intr_gate(vector, interrupt[irq]);
7319 +#endif
7320 +               dynamic_irq_init(irq);
7321 +       }
7322 +       return irq;
7323 +}
7324 +
7325 +void destroy_irq(unsigned int irq)
7326 +{
7327 +       unsigned long flags;
7328 +
7329 +       dynamic_irq_cleanup(irq);
7330 +
7331 +       spin_lock_irqsave(&vector_lock, flags);
7332 +       irq_vector[irq] = 0;
7333 +       spin_unlock_irqrestore(&vector_lock, flags);
7334 +}
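A short usage sketch (editorial, not part of the patch) of the dynamic-IRQ pair above, in the style of the MSI path further down; the function name is hypothetical:

static int example_dynamic_irq(void)
{
        int irq = create_irq();

        if (irq < 0)
                return irq;             /* -ENOSPC when no free irq/vector */

        /* ... point a device (e.g. an MSI source) at this irq ... */

        /* and on teardown: */
        destroy_irq(irq);
        return 0;
}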
7335 +
7336 +/*
7337 + * MSI message composition
7338 + */
7339 +#ifdef CONFIG_PCI_MSI
7340 +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
7341 +{
7342 +       int vector;
7343 +       unsigned dest;
7344 +
7345 +       vector = assign_irq_vector(irq);
7346 +       if (vector >= 0) {
7347 +               dest = cpu_mask_to_apicid(TARGET_CPUS);
7348 +
7349 +               msg->address_hi = MSI_ADDR_BASE_HI;
7350 +               msg->address_lo =
7351 +                       MSI_ADDR_BASE_LO |
7352 +                       ((INT_DEST_MODE == 0) ?
7353 +                               MSI_ADDR_DEST_MODE_PHYSICAL:
7354 +                               MSI_ADDR_DEST_MODE_LOGICAL) |
7355 +                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7356 +                               MSI_ADDR_REDIRECTION_CPU:
7357 +                               MSI_ADDR_REDIRECTION_LOWPRI) |
7358 +                       MSI_ADDR_DEST_ID(dest);
7359 +
7360 +               msg->data =
7361 +                       MSI_DATA_TRIGGER_EDGE |
7362 +                       MSI_DATA_LEVEL_ASSERT |
7363 +                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7364 +                               MSI_DATA_DELIVERY_FIXED:
7365 +                               MSI_DATA_DELIVERY_LOWPRI) |
7366 +                       MSI_DATA_VECTOR(vector);
7367 +       }
7368 +       return vector;
7369 +}
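For context (editorial, not part of the patch): msi_compose_msg() builds the architectural x86 MSI message, whose address targets the 0xFEE00000 window with the destination APIC ID in bits 19:12, and whose data word carries the vector in its low byte. A rough sketch of the address encoding, using the architectural bit positions rather than the msidef.h macros:

static unsigned int example_msi_address(unsigned int apicid,
                                        int logical, int lowest_prio)
{
        unsigned int addr = 0xfee00000u;        /* architectural MSI window */

        addr |= (apicid & 0xff) << 12;          /* destination ID, bits 19:12 */
        if (logical)
                addr |= 1u << 2;                /* destination mode: logical */
        if (lowest_prio)
                addr |= 1u << 3;                /* redirection hint */
        return addr;
}
/* e.g. physical-mode delivery to APIC ID 1 gives address_lo = 0xfee01000 */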
7370 +
7371 +#ifdef CONFIG_SMP
7372 +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
7373 +{
7374 +       struct msi_msg msg;
7375 +       unsigned int dest;
7376 +       cpumask_t tmp;
7377 +       int vector;
7378 +
7379 +       cpus_and(tmp, mask, cpu_online_map);
7380 +       if (cpus_empty(tmp))
7381 +               tmp = TARGET_CPUS;
7382 +
7383 +       vector = assign_irq_vector(irq);
7384 +       if (vector < 0)
7385 +               return;
7386 +
7387 +       dest = cpu_mask_to_apicid(mask);
7388 +
7389 +       read_msi_msg(irq, &msg);
7390 +
7391 +       msg.data &= ~MSI_DATA_VECTOR_MASK;
7392 +       msg.data |= MSI_DATA_VECTOR(vector);
7393 +       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
7394 +       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
7395 +
7396 +       write_msi_msg(irq, &msg);
7397 +       set_native_irq_info(irq, mask);
7398 +}
7399 +#endif /* CONFIG_SMP */
7400 +
7401 +/*
7402 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
7403 + * which implement the MSI or MSI-X Capability Structure.
7404 + */
7405 +static struct irq_chip msi_chip = {
7406 +       .name           = "PCI-MSI",
7407 +       .unmask         = unmask_msi_irq,
7408 +       .mask           = mask_msi_irq,
7409 +       .ack            = ack_ioapic_irq,
7410 +#ifdef CONFIG_SMP
7411 +       .set_affinity   = set_msi_irq_affinity,
7412 +#endif
7413 +       .retrigger      = ioapic_retrigger_irq,
7414 +};
7415 +
7416 +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
7417 +{
7418 +       struct msi_msg msg;
7419 +       int ret;
7420 +       ret = msi_compose_msg(dev, irq, &msg);
7421 +       if (ret < 0)
7422 +               return ret;
7423 +
7424 +       write_msi_msg(irq, &msg);
7425 +
7426 +       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
7427 +                                     "edge");
7428 +
7429 +       return 0;
7430 +}
7431 +
7432 +void arch_teardown_msi_irq(unsigned int irq)
7433 +{
7434 +       return;
7435 +}
7436 +
7437 +#endif /* CONFIG_PCI_MSI */
7438 +
7439 +/*
7440 + * Hypertransport interrupt support
7441 + */
7442 +#ifdef CONFIG_HT_IRQ
7443 +
7444 +#ifdef CONFIG_SMP
7445 +
7446 +static void target_ht_irq(unsigned int irq, unsigned int dest)
7447 +{
7448 +       struct ht_irq_msg msg;
7449 +       fetch_ht_irq_msg(irq, &msg);
7450 +
7451 +       msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
7452 +       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
7453 +
7454 +       msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
7455 +       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
7456 +
7457 +       write_ht_irq_msg(irq, &msg);
7458 +}
7459 +
7460 +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
7461 +{
7462 +       unsigned int dest;
7463 +       cpumask_t tmp;
7464 +
7465 +       cpus_and(tmp, mask, cpu_online_map);
7466 +       if (cpus_empty(tmp))
7467 +               tmp = TARGET_CPUS;
7468 +
7469 +       cpus_and(mask, tmp, CPU_MASK_ALL);
7470 +
7471 +       dest = cpu_mask_to_apicid(mask);
7472 +
7473 +       target_ht_irq(irq, dest);
7474 +       set_native_irq_info(irq, mask);
7475 +}
7476 +#endif
7477 +
7478 +static struct irq_chip ht_irq_chip = {
7479 +       .name           = "PCI-HT",
7480 +       .mask           = mask_ht_irq,
7481 +       .unmask         = unmask_ht_irq,
7482 +       .ack            = ack_ioapic_irq,
7483 +#ifdef CONFIG_SMP
7484 +       .set_affinity   = set_ht_irq_affinity,
7485 +#endif
7486 +       .retrigger      = ioapic_retrigger_irq,
7487 +};
7488 +
7489 +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
7490 +{
7491 +       int vector;
7492 +
7493 +       vector = assign_irq_vector(irq);
7494 +       if (vector >= 0) {
7495 +               struct ht_irq_msg msg;
7496 +               unsigned dest;
7497 +               cpumask_t tmp;
7498 +
7499 +               cpus_clear(tmp);
7500 +               cpu_set(vector >> 8, tmp);
7501 +               dest = cpu_mask_to_apicid(tmp);
7502 +
7503 +               msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
7504 +
7505 +               msg.address_lo =
7506 +                       HT_IRQ_LOW_BASE |
7507 +                       HT_IRQ_LOW_DEST_ID(dest) |
7508 +                       HT_IRQ_LOW_VECTOR(vector) |
7509 +                       ((INT_DEST_MODE == 0) ?
7510 +                               HT_IRQ_LOW_DM_PHYSICAL :
7511 +                               HT_IRQ_LOW_DM_LOGICAL) |
7512 +                       HT_IRQ_LOW_RQEOI_EDGE |
7513 +                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7514 +                               HT_IRQ_LOW_MT_FIXED :
7515 +                               HT_IRQ_LOW_MT_ARBITRATED) |
7516 +                       HT_IRQ_LOW_IRQ_MASKED;
7517 +
7518 +               write_ht_irq_msg(irq, &msg);
7519 +
7520 +               set_irq_chip_and_handler_name(irq, &ht_irq_chip,
7521 +                                             handle_edge_irq, "edge");
7522 +       }
7523 +       return vector;
7524 +}
7525 +#endif /* CONFIG_HT_IRQ */
7526 +
7527 +/* --------------------------------------------------------------------------
7528 +                          ACPI-based IOAPIC Configuration
7529 +   -------------------------------------------------------------------------- */
7530 +
7531 +#ifdef CONFIG_ACPI
7532 +
7533 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
7534 +{
7535 +#ifndef CONFIG_XEN
7536 +       union IO_APIC_reg_00 reg_00;
7537 +       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
7538 +       physid_mask_t tmp;
7539 +       unsigned long flags;
7540 +       int i = 0;
7541 +
7542 +       /*
7543 +        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
7544 +        * buses (one for LAPICs, one for IOAPICs), where predecessors only 
7545 +        * supports up to 16 on one shared APIC bus.
7546 +        * support up to 16 on one shared APIC bus.
7547 +        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7548 +        *      advantage of new APIC bus architecture.
7549 +        */
7550 +
7551 +       if (physids_empty(apic_id_map))
7552 +               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
7553 +
7554 +       spin_lock_irqsave(&ioapic_lock, flags);
7555 +       reg_00.raw = io_apic_read(ioapic, 0);
7556 +       spin_unlock_irqrestore(&ioapic_lock, flags);
7557 +
7558 +       if (apic_id >= get_physical_broadcast()) {
7559 +               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
7560 +                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
7561 +               apic_id = reg_00.bits.ID;
7562 +       }
7563 +
7564 +       /*
7565 +        * Every APIC in a system must have a unique ID or we get lots of nice 
7566 +        * 'stuck on smp_invalidate_needed IPI wait' messages.
7567 +        */
7568 +       if (check_apicid_used(apic_id_map, apic_id)) {
7569 +
7570 +               for (i = 0; i < get_physical_broadcast(); i++) {
7571 +                       if (!check_apicid_used(apic_id_map, i))
7572 +                               break;
7573 +               }
7574 +
7575 +               if (i == get_physical_broadcast())
7576 +                       panic("Max apic_id exceeded!\n");
7577 +
7578 +               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
7579 +                       "trying %d\n", ioapic, apic_id, i);
7580 +
7581 +               apic_id = i;
7582 +       } 
7583 +
7584 +       tmp = apicid_to_cpu_present(apic_id);
7585 +       physids_or(apic_id_map, apic_id_map, tmp);
7586 +
7587 +       if (reg_00.bits.ID != apic_id) {
7588 +               reg_00.bits.ID = apic_id;
7589 +
7590 +               spin_lock_irqsave(&ioapic_lock, flags);
7591 +               io_apic_write(ioapic, 0, reg_00.raw);
7592 +               reg_00.raw = io_apic_read(ioapic, 0);
7593 +               spin_unlock_irqrestore(&ioapic_lock, flags);
7594 +
7595 +               /* Sanity check */
7596 +               if (reg_00.bits.ID != apic_id) {
7597 +                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
7598 +                       return -1;
7599 +               }
7600 +       }
7601 +
7602 +       apic_printk(APIC_VERBOSE, KERN_INFO
7603 +                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
7604 +#endif /* !CONFIG_XEN */
7605 +
7606 +       return apic_id;
7607 +}
7608 +
7609 +
7610 +int __init io_apic_get_version (int ioapic)
7611 +{
7612 +       union IO_APIC_reg_01    reg_01;
7613 +       unsigned long flags;
7614 +
7615 +       spin_lock_irqsave(&ioapic_lock, flags);
7616 +       reg_01.raw = io_apic_read(ioapic, 1);
7617 +       spin_unlock_irqrestore(&ioapic_lock, flags);
7618 +
7619 +       return reg_01.bits.version;
7620 +}
7621 +
7622 +
7623 +int __init io_apic_get_redir_entries (int ioapic)
7624 +{
7625 +       union IO_APIC_reg_01    reg_01;
7626 +       unsigned long flags;
7627 +
7628 +       spin_lock_irqsave(&ioapic_lock, flags);
7629 +       reg_01.raw = io_apic_read(ioapic, 1);
7630 +       spin_unlock_irqrestore(&ioapic_lock, flags);
7631 +
7632 +       return reg_01.bits.entries;
7633 +}
7634 +
7635 +
7636 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7637 +{
7638 +       struct IO_APIC_route_entry entry;
7639 +       unsigned long flags;
7640 +
7641 +       if (!IO_APIC_IRQ(irq)) {
7642 +               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
7643 +                       ioapic);
7644 +               return -EINVAL;
7645 +       }
7646 +
7647 +       /*
7648 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
7649 +        * Note that we mask (disable) IRQs now -- these get enabled when the
7650 +        * corresponding device driver registers for this IRQ.
7651 +        */
7652 +
7653 +       memset(&entry,0,sizeof(entry));
7654 +
7655 +       entry.delivery_mode = INT_DELIVERY_MODE;
7656 +       entry.dest_mode = INT_DEST_MODE;
7657 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
7658 +       entry.trigger = edge_level;
7659 +       entry.polarity = active_high_low;
7660 +       entry.mask  = 1;
7661 +
7662 +       /*
7663 +        * IRQs < 16 are already in the irq_2_pin[] map
7664 +        */
7665 +       if (irq >= 16)
7666 +               add_pin_to_irq(irq, ioapic, pin);
7667 +
7668 +       entry.vector = assign_irq_vector(irq);
7669 +
7670 +       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7671 +               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7672 +               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7673 +               edge_level, active_high_low);
7674 +
7675 +       ioapic_register_intr(irq, entry.vector, edge_level);
7676 +
7677 +       if (!ioapic && (irq < 16))
7678 +               disable_8259A_irq(irq);
7679 +
7680 +       ioapic_write_entry(ioapic, pin, entry);
7681 +       spin_lock_irqsave(&ioapic_lock, flags);
7682 +       set_native_irq_info(irq, TARGET_CPUS);
7683 +       spin_unlock_irqrestore(&ioapic_lock, flags);
7684 +
7685 +       return 0;
7686 +}
7687 +
7688 +#endif /* CONFIG_ACPI */
7689 +
7690 +static int __init parse_disable_timer_pin_1(char *arg)
7691 +{
7692 +       disable_timer_pin_1 = 1;
7693 +       return 0;
7694 +}
7695 +early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
7696 +
7697 +static int __init parse_enable_timer_pin_1(char *arg)
7698 +{
7699 +       disable_timer_pin_1 = -1;
7700 +       return 0;
7701 +}
7702 +early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
7703 +
7704 +static int __init parse_noapic(char *arg)
7705 +{
7706 +       /* disable IO-APIC */
7707 +       disable_ioapic_setup();
7708 +       return 0;
7709 +}
7710 +early_param("noapic", parse_noapic);
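The early_param() hooks above wire these switches to the kernel command line; purely for illustration (the kernel image path and root device are placeholders), a boot loader entry might pass them as:

        kernel /vmlinuz-2.6.19-xen ro root=/dev/sda1 noapic
        kernel /vmlinuz-2.6.19-xen ro root=/dev/sda1 disable_timer_pin_1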
7711 diff -ruNp linux-2.6.19/arch/i386/kernel/ioport-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/ioport-xen.c
7712 --- linux-2.6.19/arch/i386/kernel/ioport-xen.c  1970-01-01 00:00:00.000000000 +0000
7713 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/ioport-xen.c        2007-02-02 19:10:21.000000000 +0000
7714 @@ -0,0 +1,121 @@
7715 +/*
7716 + *     linux/arch/i386/kernel/ioport.c
7717 + *
7718 + * This contains the io-permission bitmap code - written by obz, with changes
7719 + * by Linus.
7720 + */
7721 +
7722 +#include <linux/sched.h>
7723 +#include <linux/kernel.h>
7724 +#include <linux/capability.h>
7725 +#include <linux/errno.h>
7726 +#include <linux/types.h>
7727 +#include <linux/ioport.h>
7728 +#include <linux/smp.h>
7729 +#include <linux/smp_lock.h>
7730 +#include <linux/stddef.h>
7731 +#include <linux/slab.h>
7732 +#include <linux/thread_info.h>
7733 +#include <xen/interface/physdev.h>
7734 +
7735 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
7736 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
7737 +{
7738 +       unsigned long mask;
7739 +       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
7740 +       unsigned int low_index = base & (BITS_PER_LONG-1);
7741 +       int length = low_index + extent;
7742 +
7743 +       if (low_index != 0) {
7744 +               mask = (~0UL << low_index);
7745 +               if (length < BITS_PER_LONG)
7746 +                       mask &= ~(~0UL << length);
7747 +               if (new_value)
7748 +                       *bitmap_base++ |= mask;
7749 +               else
7750 +                       *bitmap_base++ &= ~mask;
7751 +               length -= BITS_PER_LONG;
7752 +       }
7753 +
7754 +       mask = (new_value ? ~0UL : 0UL);
7755 +       while (length >= BITS_PER_LONG) {
7756 +               *bitmap_base++ = mask;
7757 +               length -= BITS_PER_LONG;
7758 +       }
7759 +
7760 +       if (length > 0) {
7761 +               mask = ~(~0UL << length);
7762 +               if (new_value)
7763 +                       *bitmap_base++ |= mask;
7764 +               else
7765 +                       *bitmap_base++ &= ~mask;
7766 +       }
7767 +}
7768 +
7769 +
7770 +/*
7771 + * this changes the io permissions bitmap in the current task.
7772 + */
7773 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
7774 +{
7775 +       struct thread_struct * t = &current->thread;
7776 +       unsigned long *bitmap;
7777 +       struct physdev_set_iobitmap set_iobitmap;
7778 +
7779 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
7780 +               return -EINVAL;
7781 +       if (turn_on && !capable(CAP_SYS_RAWIO))
7782 +               return -EPERM;
7783 +
7784 +       /*
7785 +        * If it's the first ioperm() call in this thread's lifetime, set the
7786 +        * IO bitmap up. ioperm() is much less timing critical than clone();
7787 +        * this is why we delay this operation until now:
7788 +        */
7789 +       if (!t->io_bitmap_ptr) {
7790 +               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
7791 +               if (!bitmap)
7792 +                       return -ENOMEM;
7793 +
7794 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
7795 +               t->io_bitmap_ptr = bitmap;
7796 +
7797 +               set_iobitmap.bitmap   = (char *)bitmap;
7798 +               set_iobitmap.nr_ports = IO_BITMAP_BITS;
7799 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
7800 +       }
7801 +
7802 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
7803 +
7804 +       return 0;
7805 +}
7806 +
7807 +/*
7808 + * sys_iopl has to be used when you want to access the IO ports
7809 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
7810 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
7811 + *
7812 + * Here we just change the eflags value on the stack: we allow
7813 + * only the super-user to do it. This depends on the stack-layout
7814 + * on system-call entry - see also fork() and the signal handling
7815 + * code.
7816 + */
7817 +
7818 +asmlinkage long sys_iopl(unsigned long unused)
7819 +{
7820 +       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
7821 +       unsigned int level = regs->ebx;
7822 +       struct thread_struct *t = &current->thread;
7823 +       unsigned int old = (t->iopl >> 12) & 3;
7824 +
7825 +       if (level > 3)
7826 +               return -EINVAL;
7827 +       /* Trying to gain more privileges? */
7828 +       if (level > old) {
7829 +               if (!capable(CAP_SYS_RAWIO))
7830 +                       return -EPERM;
7831 +       }
7832 +       t->iopl = level << 12;
7833 +       set_iopl_mask(t->iopl);
7834 +       return 0;
7835 +}
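For context, a minimal, hypothetical userspace sketch of how the two syscalls above are typically exercised through the glibc ioperm()/iopl() wrappers; the port 0x378 and the value written are purely illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <sys/io.h>     /* glibc wrappers: ioperm(), iopl(), outb() */

int main(void)
{
        /* Ask for three ports starting at 0x378; needs CAP_SYS_RAWIO. */
        if (ioperm(0x378, 3, 1) != 0) {
                perror("ioperm");
                return EXIT_FAILURE;
        }
        outb(0x00, 0x378);      /* now allowed by the per-thread IO bitmap */
        ioperm(0x378, 3, 0);    /* drop the permission again */

        /* iopl(3) would instead raise the whole I/O privilege level. */
        return EXIT_SUCCESS;
}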
7836 diff -ruNp linux-2.6.19/arch/i386/kernel/irq-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/irq-xen.c
7837 --- linux-2.6.19/arch/i386/kernel/irq-xen.c     1970-01-01 00:00:00.000000000 +0000
7838 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/irq-xen.c   2007-02-02 19:10:21.000000000 +0000
7839 @@ -0,0 +1,328 @@
7840 +/*
7841 + *     linux/arch/i386/kernel/irq.c
7842 + *
7843 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
7844 + *
7845 + * This file contains the lowest level x86-specific interrupt
7846 + * entry, irq-stacks and irq statistics code. All the remaining
7847 + * irq logic is done by the generic kernel/irq/ code and
7848 + * by the x86-specific irq controller code. (e.g. i8259.c and
7849 + * io_apic.c.)
7850 + */
7851 +
7852 +#include <asm/uaccess.h>
7853 +#include <linux/module.h>
7854 +#include <linux/seq_file.h>
7855 +#include <linux/interrupt.h>
7856 +#include <linux/kernel_stat.h>
7857 +#include <linux/notifier.h>
7858 +#include <linux/cpu.h>
7859 +#include <linux/delay.h>
7860 +
7861 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
7862 +EXPORT_PER_CPU_SYMBOL(irq_stat);
7863 +
7864 +#ifndef CONFIG_X86_LOCAL_APIC
7865 +/*
7866 + * 'what should we do if we get a hw irq event on an illegal vector'.
7867 + * each architecture has to answer this themselves.
7868 + */
7869 +void ack_bad_irq(unsigned int irq)
7870 +{
7871 +       printk("unexpected IRQ trap at vector %02x\n", irq);
7872 +}
7873 +#endif
7874 +
7875 +#ifdef CONFIG_4KSTACKS
7876 +/*
7877 + * per-CPU IRQ handling contexts (thread information and stack)
7878 + */
7879 +union irq_ctx {
7880 +       struct thread_info      tinfo;
7881 +       u32                     stack[THREAD_SIZE/sizeof(u32)];
7882 +};
7883 +
7884 +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
7885 +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
7886 +#endif
7887 +
7888 +/*
7889 + * do_IRQ handles all normal device IRQ's (the special
7890 + * SMP cross-CPU interrupts have their own specific
7891 + * handlers).
7892 + */
7893 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
7894 +{      
7895 +       struct pt_regs *old_regs;
7896 +       /* high bit used in ret_from_ code */
7897 +       int irq = ~regs->orig_eax;
7898 +       struct irq_desc *desc = irq_desc + irq;
7899 +#ifdef CONFIG_4KSTACKS
7900 +       union irq_ctx *curctx, *irqctx;
7901 +       u32 *isp;
7902 +#endif
7903 +
7904 +       if (unlikely((unsigned)irq >= NR_IRQS)) {
7905 +               printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
7906 +                                       __FUNCTION__, irq);
7907 +               BUG();
7908 +       }
7909 +
7910 +       old_regs = set_irq_regs(regs);
7911 +       irq_enter();
7912 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
7913 +       /* Debugging check for stack overflow: is there less than 1KB free? */
7914 +       {
7915 +               long esp;
7916 +
7917 +               __asm__ __volatile__("andl %%esp,%0" :
7918 +                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
7919 +               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
7920 +                       printk("do_IRQ: stack overflow: %ld\n",
7921 +                               esp - sizeof(struct thread_info));
7922 +                       dump_stack();
7923 +               }
7924 +       }
7925 +#endif
7926 +
7927 +#ifdef CONFIG_4KSTACKS
7928 +
7929 +       curctx = (union irq_ctx *) current_thread_info();
7930 +       irqctx = hardirq_ctx[smp_processor_id()];
7931 +
7932 +       /*
7933 +        * this is where we switch to the IRQ stack. However, if we are
7934 +        * already using the IRQ stack (because we interrupted a hardirq
7935 +        * handler) we can't do that and just have to keep using the
7936 +        * current stack (which is the irq stack already after all)
7937 +        */
7938 +       if (curctx != irqctx) {
7939 +               int arg1, arg2, ebx;
7940 +
7941 +               /* build the stack frame on the IRQ stack */
7942 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7943 +               irqctx->tinfo.task = curctx->tinfo.task;
7944 +               irqctx->tinfo.previous_esp = current_stack_pointer;
7945 +
7946 +               /*
7947 +                * Copy the softirq bits in preempt_count so that the
7948 +                * softirq checks work in the hardirq context.
7949 +                */
7950 +               irqctx->tinfo.preempt_count =
7951 +                       (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
7952 +                       (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
7953 +
7954 +               asm volatile(
7955 +                       "       xchgl  %%ebx,%%esp      \n"
7956 +                       "       call   *%%edi           \n"
7957 +                       "       movl   %%ebx,%%esp      \n"
7958 +                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
7959 +                       :  "0" (irq),   "1" (desc),  "2" (isp),
7960 +                          "D" (desc->handle_irq)
7961 +                       : "memory", "cc"
7962 +               );
7963 +       } else
7964 +#endif
7965 +               desc->handle_irq(irq, desc);
7966 +
7967 +       irq_exit();
7968 +       set_irq_regs(old_regs);
7969 +       return 1;
7970 +}
7971 +
7972 +#ifdef CONFIG_4KSTACKS
7973 +
7974 +/*
7975 + * These should really be __section__(".bss.page_aligned") as well, but
7976 + * gcc 3.0 and earlier don't handle that correctly.
7977 + */
7978 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
7979 +               __attribute__((__aligned__(THREAD_SIZE)));
7980 +
7981 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
7982 +               __attribute__((__aligned__(THREAD_SIZE)));
7983 +
7984 +/*
7985 + * allocate per-cpu stacks for hardirq and for softirq processing
7986 + */
7987 +void irq_ctx_init(int cpu)
7988 +{
7989 +       union irq_ctx *irqctx;
7990 +
7991 +       if (hardirq_ctx[cpu])
7992 +               return;
7993 +
7994 +       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
7995 +       irqctx->tinfo.task              = NULL;
7996 +       irqctx->tinfo.exec_domain       = NULL;
7997 +       irqctx->tinfo.cpu               = cpu;
7998 +       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
7999 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
8000 +
8001 +       hardirq_ctx[cpu] = irqctx;
8002 +
8003 +       irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8004 +       irqctx->tinfo.task              = NULL;
8005 +       irqctx->tinfo.exec_domain       = NULL;
8006 +       irqctx->tinfo.cpu               = cpu;
8007 +       irqctx->tinfo.preempt_count     = 0;
8008 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
8009 +
8010 +       softirq_ctx[cpu] = irqctx;
8011 +
8012 +       printk("CPU %u irqstacks, hard=%p soft=%p\n",
8013 +               cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8014 +}
8015 +
8016 +void irq_ctx_exit(int cpu)
8017 +{
8018 +       hardirq_ctx[cpu] = NULL;
8019 +}
8020 +
8021 +extern asmlinkage void __do_softirq(void);
8022 +
8023 +asmlinkage void do_softirq(void)
8024 +{
8025 +       unsigned long flags;
8026 +       struct thread_info *curctx;
8027 +       union irq_ctx *irqctx;
8028 +       u32 *isp;
8029 +
8030 +       if (in_interrupt())
8031 +               return;
8032 +
8033 +       local_irq_save(flags);
8034 +
8035 +       if (local_softirq_pending()) {
8036 +               curctx = current_thread_info();
8037 +               irqctx = softirq_ctx[smp_processor_id()];
8038 +               irqctx->tinfo.task = curctx->task;
8039 +               irqctx->tinfo.previous_esp = current_stack_pointer;
8040 +
8041 +               /* build the stack frame on the softirq stack */
8042 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8043 +
8044 +               asm volatile(
8045 +                       "       xchgl   %%ebx,%%esp     \n"
8046 +                       "       call    __do_softirq    \n"
8047 +                       "       movl    %%ebx,%%esp     \n"
8048 +                       : "=b"(isp)
8049 +                       : "0"(isp)
8050 +                       : "memory", "cc", "edx", "ecx", "eax"
8051 +               );
8052 +               /*
8053 +                * Shouldn't happen, we returned above if in_interrupt():
8054 +                */
8055 +               WARN_ON_ONCE(softirq_count());
8056 +       }
8057 +
8058 +       local_irq_restore(flags);
8059 +}
8060 +
8061 +EXPORT_SYMBOL(do_softirq);
8062 +#endif
8063 +
8064 +/*
8065 + * Interrupt statistics:
8066 + */
8067 +
8068 +atomic_t irq_err_count;
8069 +
8070 +/*
8071 + * /proc/interrupts printing:
8072 + */
8073 +
8074 +int show_interrupts(struct seq_file *p, void *v)
8075 +{
8076 +       int i = *(loff_t *) v, j;
8077 +       struct irqaction * action;
8078 +       unsigned long flags;
8079 +
8080 +       if (i == 0) {
8081 +               seq_printf(p, "           ");
8082 +               for_each_online_cpu(j)
8083 +                       seq_printf(p, "CPU%-8d",j);
8084 +               seq_putc(p, '\n');
8085 +       }
8086 +
8087 +       if (i < NR_IRQS) {
8088 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
8089 +               action = irq_desc[i].action;
8090 +               if (!action)
8091 +                       goto skip;
8092 +               seq_printf(p, "%3d: ",i);
8093 +#ifndef CONFIG_SMP
8094 +               seq_printf(p, "%10u ", kstat_irqs(i));
8095 +#else
8096 +               for_each_online_cpu(j)
8097 +                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
8098 +#endif
8099 +               seq_printf(p, " %8s", irq_desc[i].chip->name);
8100 +               seq_printf(p, "-%-8s", irq_desc[i].name);
8101 +               seq_printf(p, "  %s", action->name);
8102 +
8103 +               for (action=action->next; action; action = action->next)
8104 +                       seq_printf(p, ", %s", action->name);
8105 +
8106 +               seq_putc(p, '\n');
8107 +skip:
8108 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
8109 +       } else if (i == NR_IRQS) {
8110 +               seq_printf(p, "NMI: ");
8111 +               for_each_online_cpu(j)
8112 +                       seq_printf(p, "%10u ", nmi_count(j));
8113 +               seq_putc(p, '\n');
8114 +#ifdef CONFIG_X86_LOCAL_APIC
8115 +               seq_printf(p, "LOC: ");
8116 +               for_each_online_cpu(j)
8117 +                       seq_printf(p, "%10u ",
8118 +                               per_cpu(irq_stat,j).apic_timer_irqs);
8119 +               seq_putc(p, '\n');
8120 +#endif
8121 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
8122 +#if defined(CONFIG_X86_IO_APIC)
8123 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
8124 +#endif
8125 +       }
8126 +       return 0;
8127 +}
8128 +
8129 +#ifdef CONFIG_HOTPLUG_CPU
8130 +void fixup_irqs(cpumask_t map)
8131 +{
8132 +       unsigned int irq;
8133 +       static int warned;
8134 +
8135 +       for (irq = 0; irq < NR_IRQS; irq++) {
8136 +               cpumask_t mask;
8137 +               if (irq == 2)
8138 +                       continue;
8139 +
8140 +               cpus_and(mask, irq_desc[irq].affinity, map);
8141 +               if (any_online_cpu(mask) == NR_CPUS) {
8142 +                       /*printk("Breaking affinity for irq %i\n", irq);*/
8143 +                       mask = map;
8144 +               }
8145 +               if (irq_desc[irq].chip->set_affinity)
8146 +                       irq_desc[irq].chip->set_affinity(irq, mask);
8147 +               else if (irq_desc[irq].action && !(warned++))
8148 +                       printk("Cannot set affinity for irq %i\n", irq);
8149 +       }
8150 +
8151 +#if 0
8152 +       barrier();
8153 +       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
8154 +          [note the nop - the interrupt-enable boundary on x86 is two
8155 +          instructions from sti] - to flush out pending hardirqs and
8156 +          IPIs. After this point nothing is supposed to reach this CPU." */
8157 +       __asm__ __volatile__("sti; nop; cli");
8158 +       barrier();
8159 +#else
8160 +       /* That doesn't seem sufficient.  Give it 1ms. */
8161 +       local_irq_enable();
8162 +       mdelay(1);
8163 +       local_irq_disable();
8164 +#endif
8165 +}
8166 +#endif
8167 +
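As a rough illustration of what eventually exercises the ->set_affinity hooks that fixup_irqs() relies on above, userspace can steer an interrupt through procfs; the IRQ number and CPU mask below are placeholders.

#include <stdio.h>

/* Pin an IRQ to the CPUs in "mask" (a hex bitmask string, e.g. "2" = CPU1). */
static int set_irq_affinity(unsigned int irq, const char *mask)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity", irq);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%s\n", mask);
        return fclose(f);
}

int main(void)
{
        return set_irq_affinity(16, "2") ? 1 : 0;
}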
8168 diff -ruNp linux-2.6.19/arch/i386/kernel/ldt-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/ldt-xen.c
8169 --- linux-2.6.19/arch/i386/kernel/ldt-xen.c     1970-01-01 00:00:00.000000000 +0000
8170 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/ldt-xen.c   2007-02-02 19:10:21.000000000 +0000
8171 @@ -0,0 +1,270 @@
8172 +/*
8173 + * linux/arch/i386/kernel/ldt.c
8174 + *
8175 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
8176 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
8177 + */
8178 +
8179 +#include <linux/errno.h>
8180 +#include <linux/sched.h>
8181 +#include <linux/string.h>
8182 +#include <linux/mm.h>
8183 +#include <linux/smp.h>
8184 +#include <linux/smp_lock.h>
8185 +#include <linux/vmalloc.h>
8186 +#include <linux/slab.h>
8187 +
8188 +#include <asm/uaccess.h>
8189 +#include <asm/system.h>
8190 +#include <asm/ldt.h>
8191 +#include <asm/desc.h>
8192 +#include <asm/mmu_context.h>
8193 +
8194 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
8195 +static void flush_ldt(void *null)
8196 +{
8197 +       if (current->active_mm)
8198 +               load_LDT(&current->active_mm->context);
8199 +}
8200 +#endif
8201 +
8202 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
8203 +{
8204 +       void *oldldt;
8205 +       void *newldt;
8206 +       int oldsize;
8207 +
8208 +       if (mincount <= pc->size)
8209 +               return 0;
8210 +       oldsize = pc->size;
8211 +       mincount = (mincount+511)&(~511);
8212 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
8213 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
8214 +       else
8215 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
8216 +
8217 +       if (!newldt)
8218 +               return -ENOMEM;
8219 +
8220 +       if (oldsize)
8221 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
8222 +       oldldt = pc->ldt;
8223 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
8224 +       pc->ldt = newldt;
8225 +       wmb();
8226 +       pc->size = mincount;
8227 +       wmb();
8228 +
8229 +       if (reload) {
8230 +#ifdef CONFIG_SMP
8231 +               cpumask_t mask;
8232 +               preempt_disable();
8233 +#endif
8234 +               make_pages_readonly(
8235 +                       pc->ldt,
8236 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
8237 +                       XENFEAT_writable_descriptor_tables);
8238 +               load_LDT(pc);
8239 +#ifdef CONFIG_SMP
8240 +               mask = cpumask_of_cpu(smp_processor_id());
8241 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8242 +                       smp_call_function(flush_ldt, NULL, 1, 1);
8243 +               preempt_enable();
8244 +#endif
8245 +       }
8246 +       if (oldsize) {
8247 +               make_pages_writable(
8248 +                       oldldt,
8249 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
8250 +                       XENFEAT_writable_descriptor_tables);
8251 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
8252 +                       vfree(oldldt);
8253 +               else
8254 +                       kfree(oldldt);
8255 +       }
8256 +       return 0;
8257 +}
8258 +
8259 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
8260 +{
8261 +       int err = alloc_ldt(new, old->size, 0);
8262 +       if (err < 0)
8263 +               return err;
8264 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
8265 +       make_pages_readonly(
8266 +               new->ldt,
8267 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
8268 +               XENFEAT_writable_descriptor_tables);
8269 +       return 0;
8270 +}
8271 +
8272 +/*
8273 + * we do not have to muck with descriptors here, that is
8274 + * done in switch_mm() as needed.
8275 + */
8276 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
8277 +{
8278 +       struct mm_struct * old_mm;
8279 +       int retval = 0;
8280 +
8281 +       init_MUTEX(&mm->context.sem);
8282 +       mm->context.size = 0;
8283 +       mm->context.has_foreign_mappings = 0;
8284 +       old_mm = current->mm;
8285 +       if (old_mm && old_mm->context.size > 0) {
8286 +               down(&old_mm->context.sem);
8287 +               retval = copy_ldt(&mm->context, &old_mm->context);
8288 +               up(&old_mm->context.sem);
8289 +       }
8290 +       return retval;
8291 +}
8292 +
8293 +/*
8294 + * No need to lock the MM as we are the last user
8295 + */
8296 +void destroy_context(struct mm_struct *mm)
8297 +{
8298 +       if (mm->context.size) {
8299 +               if (mm == current->active_mm)
8300 +                       clear_LDT();
8301 +               make_pages_writable(
8302 +                       mm->context.ldt,
8303 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
8304 +                       XENFEAT_writable_descriptor_tables);
8305 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
8306 +                       vfree(mm->context.ldt);
8307 +               else
8308 +                       kfree(mm->context.ldt);
8309 +               mm->context.size = 0;
8310 +       }
8311 +}
8312 +
8313 +static int read_ldt(void __user * ptr, unsigned long bytecount)
8314 +{
8315 +       int err;
8316 +       unsigned long size;
8317 +       struct mm_struct * mm = current->mm;
8318 +
8319 +       if (!mm->context.size)
8320 +               return 0;
8321 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
8322 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
8323 +
8324 +       down(&mm->context.sem);
8325 +       size = mm->context.size*LDT_ENTRY_SIZE;
8326 +       if (size > bytecount)
8327 +               size = bytecount;
8328 +
8329 +       err = 0;
8330 +       if (copy_to_user(ptr, mm->context.ldt, size))
8331 +               err = -EFAULT;
8332 +       up(&mm->context.sem);
8333 +       if (err < 0)
8334 +               goto error_return;
8335 +       if (size != bytecount) {
8336 +               /* zero-fill the rest */
8337 +               if (clear_user(ptr+size, bytecount-size) != 0) {
8338 +                       err = -EFAULT;
8339 +                       goto error_return;
8340 +               }
8341 +       }
8342 +       return bytecount;
8343 +error_return:
8344 +       return err;
8345 +}
8346 +
8347 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
8348 +{
8349 +       int err;
8350 +       unsigned long size;
8351 +       void *address;
8352 +
8353 +       err = 0;
8354 +       address = &default_ldt[0];
8355 +       size = 5*sizeof(struct desc_struct);
8356 +       if (size > bytecount)
8357 +               size = bytecount;
8358 +
8359 +       err = size;
8360 +       if (copy_to_user(ptr, address, size))
8361 +               err = -EFAULT;
8362 +
8363 +       return err;
8364 +}
8365 +
8366 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
8367 +{
8368 +       struct mm_struct * mm = current->mm;
8369 +       __u32 entry_1, entry_2;
8370 +       int error;
8371 +       struct user_desc ldt_info;
8372 +
8373 +       error = -EINVAL;
8374 +       if (bytecount != sizeof(ldt_info))
8375 +               goto out;
8376 +       error = -EFAULT;        
8377 +       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
8378 +               goto out;
8379 +
8380 +       error = -EINVAL;
8381 +       if (ldt_info.entry_number >= LDT_ENTRIES)
8382 +               goto out;
8383 +       if (ldt_info.contents == 3) {
8384 +               if (oldmode)
8385 +                       goto out;
8386 +               if (ldt_info.seg_not_present == 0)
8387 +                       goto out;
8388 +       }
8389 +
8390 +       down(&mm->context.sem);
8391 +       if (ldt_info.entry_number >= mm->context.size) {
8392 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
8393 +               if (error < 0)
8394 +                       goto out_unlock;
8395 +       }
8396 +
8397 +       /* Allow LDTs to be cleared by the user. */
8398 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
8399 +               if (oldmode || LDT_empty(&ldt_info)) {
8400 +                       entry_1 = 0;
8401 +                       entry_2 = 0;
8402 +                       goto install;
8403 +               }
8404 +       }
8405 +
8406 +       entry_1 = LDT_entry_a(&ldt_info);
8407 +       entry_2 = LDT_entry_b(&ldt_info);
8408 +       if (oldmode)
8409 +               entry_2 &= ~(1 << 20);
8410 +
8411 +       /* Install the new entry ...  */
8412 +install:
8413 +       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
8414 +                               entry_1, entry_2);
8415 +
8416 +out_unlock:
8417 +       up(&mm->context.sem);
8418 +out:
8419 +       return error;
8420 +}
8421 +
8422 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
8423 +{
8424 +       int ret = -ENOSYS;
8425 +
8426 +       switch (func) {
8427 +       case 0:
8428 +               ret = read_ldt(ptr, bytecount);
8429 +               break;
8430 +       case 1:
8431 +               ret = write_ldt(ptr, bytecount, 1);
8432 +               break;
8433 +       case 2:
8434 +               ret = read_default_ldt(ptr, bytecount);
8435 +               break;
8436 +       case 0x11:
8437 +               ret = write_ldt(ptr, bytecount, 0);
8438 +               break;
8439 +       }
8440 +       return ret;
8441 +}
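A minimal, hypothetical userspace sketch of driving the sys_modify_ldt() dispatcher above; glibc exposes the call only via syscall(2), and the descriptor values chosen here are purely illustrative.

#include <asm/ldt.h>            /* struct user_desc, LDT_ENTRY_SIZE */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct user_desc desc;
        char buf[16 * LDT_ENTRY_SIZE];

        memset(&desc, 0, sizeof(desc));
        desc.entry_number   = 0;        /* first LDT slot */
        desc.base_addr      = 0;        /* illustrative flat data segment */
        desc.limit          = 0xfffff;
        desc.seg_32bit      = 1;
        desc.limit_in_pages = 1;

        /* func 0x11: write_ldt() in new mode, as dispatched above */
        if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0)
                return 1;

        /* func 0: read_ldt() copies the table back; returns bytes read */
        return syscall(SYS_modify_ldt, 0, buf, sizeof(buf)) < 0;
}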
8442 diff -ruNp linux-2.6.19/arch/i386/kernel/machine_kexec.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/machine_kexec.c
8443 --- linux-2.6.19/arch/i386/kernel/machine_kexec.c       2006-11-29 21:57:37.000000000 +0000
8444 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/machine_kexec.c     2007-02-02 19:10:21.000000000 +0000
8445 @@ -20,6 +20,10 @@
8446  #include <asm/desc.h>
8447  #include <asm/system.h>
8448  
8449 +#ifdef CONFIG_XEN
8450 +#include <xen/interface/kexec.h>
8451 +#endif
8452 +
8453  #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
8454  static u32 kexec_pgd[1024] PAGE_ALIGNED;
8455  #ifdef CONFIG_X86_PAE
8456 @@ -29,6 +33,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
8457  static u32 kexec_pte0[1024] PAGE_ALIGNED;
8458  static u32 kexec_pte1[1024] PAGE_ALIGNED;
8459  
8460 +#ifdef CONFIG_XEN
8461 +
8462 +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
8463 +
8464 +#if PAGES_NR > KEXEC_XEN_NO_PAGES
8465 +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
8466 +#endif
8467 +
8468 +#if PA_CONTROL_PAGE != 0
8469 +#error PA_CONTROL_PAGE is non zero - Xen support will break
8470 +#endif
8471 +
8472 +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
8473 +{
8474 +       void *control_page;
8475 +
8476 +       memset(xki->page_list, 0, sizeof(xki->page_list));
8477 +
8478 +       control_page = page_address(image->control_code_page);
8479 +       memcpy(control_page, relocate_kernel, PAGE_SIZE);
8480 +
8481 +       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
8482 +       xki->page_list[PA_PGD] = __ma(kexec_pgd);
8483 +#ifdef CONFIG_X86_PAE
8484 +       xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
8485 +       xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
8486 +#endif
8487 +       xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
8488 +       xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
8489 +
8490 +}
8491 +
8492 +#endif /* CONFIG_XEN */
8493 +
8494  static void set_idt(void *newidt, __u16 limit)
8495  {
8496         struct Xgt_desc_struct curidt;
8497 @@ -97,6 +135,7 @@ void machine_kexec_cleanup(struct kimage
8498  {
8499  }
8500  
8501 +#ifndef CONFIG_XEN
8502  /*
8503   * Do not allocate memory (or fail in any way) in machine_kexec().
8504   * We are past the point of no return, committed to rebooting now.
8505 @@ -147,6 +186,7 @@ NORET_TYPE void machine_kexec(struct kim
8506         relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
8507                         image->start, cpu_has_pae);
8508  }
8509 +#endif
8510  
8511  /* crashkernel=size@addr specifies the location to reserve for
8512   * a crash kernel.  By reserving this memory we guarantee
8513 diff -ruNp linux-2.6.19/arch/i386/kernel/microcode-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/microcode-xen.c
8514 --- linux-2.6.19/arch/i386/kernel/microcode-xen.c       1970-01-01 00:00:00.000000000 +0000
8515 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/microcode-xen.c     2007-02-02 19:10:21.000000000 +0000
8516 @@ -0,0 +1,141 @@
8517 +/*
8518 + *     Intel CPU Microcode Update Driver for Linux
8519 + *
8520 + *     Copyright (C) 2000-2004 Tigran Aivazian
8521 + *
8522 + *     This driver allows upgrading the microcode on Intel processors
8523 + *     belonging to IA-32 family - PentiumPro, Pentium II, 
8524 + *     Pentium III, Xeon, Pentium 4, etc.
8525 + *
8526 + *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
8527 + *     Order Number 245472 or free download from:
8528 + *             
8529 + *     http://developer.intel.com/design/pentium4/manuals/245472.htm
8530 + *
8531 + *     For more information, go to http://www.urbanmyth.org/microcode
8532 + *
8533 + *     This program is free software; you can redistribute it and/or
8534 + *     modify it under the terms of the GNU General Public License
8535 + *     as published by the Free Software Foundation; either version
8536 + *     2 of the License, or (at your option) any later version.
8537 + */
8538 +
8539 +//#define DEBUG /* pr_debug */
8540 +#include <linux/capability.h>
8541 +#include <linux/kernel.h>
8542 +#include <linux/init.h>
8543 +#include <linux/sched.h>
8544 +#include <linux/cpumask.h>
8545 +#include <linux/module.h>
8546 +#include <linux/slab.h>
8547 +#include <linux/vmalloc.h>
8548 +#include <linux/miscdevice.h>
8549 +#include <linux/spinlock.h>
8550 +#include <linux/mm.h>
8551 +#include <linux/mutex.h>
8552 +#include <linux/syscalls.h>
8553 +
8554 +#include <asm/msr.h>
8555 +#include <asm/uaccess.h>
8556 +#include <asm/processor.h>
8557 +
8558 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
8559 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
8560 +MODULE_LICENSE("GPL");
8561 +
8562 +#define MICROCODE_VERSION      "1.14-xen"
8563 +
8564 +#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
8565 +#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
8566 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
8567 +
8568 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
8569 +static DEFINE_MUTEX(microcode_mutex);
8570 +                               
8571 +static int microcode_open (struct inode *unused1, struct file *unused2)
8572 +{
8573 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8574 +}
8575 +
8576 +
8577 +static int do_microcode_update (const void __user *ubuf, size_t len)
8578 +{
8579 +       int err;
8580 +       void *kbuf;
8581 +
8582 +       kbuf = vmalloc(len);
8583 +       if (!kbuf)
8584 +               return -ENOMEM;
8585 +
8586 +       if (copy_from_user(kbuf, ubuf, len) == 0) {
8587 +               dom0_op_t op;
8588 +
8589 +               op.cmd = DOM0_MICROCODE;
8590 +               set_xen_guest_handle(op.u.microcode.data, kbuf);
8591 +               op.u.microcode.length = len;
8592 +               err = HYPERVISOR_dom0_op(&op);
8593 +       } else
8594 +               err = -EFAULT;
8595 +
8596 +       vfree(kbuf);
8597 +
8598 +       return err;
8599 +}
8600 +
8601 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
8602 +{
8603 +       ssize_t ret;
8604 +
8605 +       if (len < DEFAULT_UCODE_TOTALSIZE) {
8606 +               printk(KERN_ERR "microcode: not enough data\n"); 
8607 +               return -EINVAL;
8608 +       }
8609 +
8610 +       mutex_lock(&microcode_mutex);
8611 +
8612 +       ret = do_microcode_update(buf, len);
8613 +       if (!ret)
8614 +               ret = (ssize_t)len;
8615 +
8616 +       mutex_unlock(&microcode_mutex);
8617 +
8618 +       return ret;
8619 +}
8620 +
8621 +static struct file_operations microcode_fops = {
8622 +       .owner          = THIS_MODULE,
8623 +       .write          = microcode_write,
8624 +       .open           = microcode_open,
8625 +};
8626 +
8627 +static struct miscdevice microcode_dev = {
8628 +       .minor          = MICROCODE_MINOR,
8629 +       .name           = "microcode",
8630 +       .fops           = &microcode_fops,
8631 +};
8632 +
8633 +static int __init microcode_init (void)
8634 +{
8635 +       int error;
8636 +
8637 +       error = misc_register(&microcode_dev);
8638 +       if (error) {
8639 +               printk(KERN_ERR
8640 +                       "microcode: can't misc_register on minor=%d\n",
8641 +                       MICROCODE_MINOR);
8642 +               return error;
8643 +       }
8644 +
8645 +       printk(KERN_INFO 
8646 +               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
8647 +       return 0;
8648 +}
8649 +
8650 +static void __exit microcode_exit (void)
8651 +{
8652 +       misc_deregister(&microcode_dev);
8653 +}
8654 +
8655 +module_init(microcode_init)
8656 +module_exit(microcode_exit)
8657 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
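A rough, hypothetical userspace counterpart to the microcode_write() handler above: load an already-assembled update image and push it to the character device in a single write. The image path is a placeholder.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        FILE *img = fopen("intel-ucode.bin", "rb");     /* placeholder path */
        long len;
        char *buf;
        int fd;

        if (!img)
                return 1;
        fseek(img, 0, SEEK_END);
        len = ftell(img);
        rewind(img);

        buf = malloc(len);
        if (!buf || fread(buf, 1, len, img) != (size_t)len)
                return 1;

        /* One write(); the driver serializes updates with microcode_mutex. */
        fd = open("/dev/cpu/microcode", O_WRONLY);
        if (fd < 0 || write(fd, buf, len) != len)
                return 1;

        close(fd);
        fclose(img);
        free(buf);
        return 0;
}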
8658 diff -ruNp linux-2.6.19/arch/i386/kernel/mpparse-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/mpparse-xen.c
8659 --- linux-2.6.19/arch/i386/kernel/mpparse-xen.c 1970-01-01 00:00:00.000000000 +0000
8660 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/mpparse-xen.c       2007-02-02 19:10:21.000000000 +0000
8661 @@ -0,0 +1,1157 @@
8662 +/*
8663 + *     Intel Multiprocessor Specification 1.1 and 1.4
8664 + *     compliant MP-table parsing routines.
8665 + *
8666 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8667 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
8668 + *
8669 + *     Fixes
8670 + *             Erich Boleyn    :       MP v1.4 and additional changes.
8671 + *             Alan Cox        :       Added EBDA scanning
8672 + *             Ingo Molnar     :       various cleanups and rewrites
8673 + *             Maciej W. Rozycki:      Bits for default MP configurations
8674 + *             Paul Diefenbaugh:       Added full ACPI support
8675 + */
8676 +
8677 +#include <linux/mm.h>
8678 +#include <linux/init.h>
8679 +#include <linux/acpi.h>
8680 +#include <linux/delay.h>
8681 +#include <linux/bootmem.h>
8682 +#include <linux/smp_lock.h>
8683 +#include <linux/kernel_stat.h>
8684 +#include <linux/mc146818rtc.h>
8685 +#include <linux/bitops.h>
8686 +
8687 +#include <asm/smp.h>
8688 +#include <asm/acpi.h>
8689 +#include <asm/mtrr.h>
8690 +#include <asm/mpspec.h>
8691 +#include <asm/io_apic.h>
8692 +
8693 +#include <mach_apic.h>
8694 +#include <mach_apicdef.h>
8695 +#include <mach_mpparse.h>
8696 +#include <bios_ebda.h>
8697 +
8698 +/* Have we found an MP table */
8699 +int smp_found_config;
8700 +unsigned int __initdata maxcpus = NR_CPUS;
8701 +
8702 +/*
8703 + * Various Linux-internal data structures created from the
8704 + * MP-table.
8705 + */
8706 +int apic_version [MAX_APICS];
8707 +int mp_bus_id_to_type [MAX_MP_BUSSES];
8708 +int mp_bus_id_to_node [MAX_MP_BUSSES];
8709 +int mp_bus_id_to_local [MAX_MP_BUSSES];
8710 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
8711 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
8712 +static int mp_current_pci_id;
8713 +
8714 +/* I/O APIC entries */
8715 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
8716 +
8717 +/* # of MP IRQ source entries */
8718 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
8719 +
8720 +/* MP IRQ source entries */
8721 +int mp_irq_entries;
8722 +
8723 +int nr_ioapics;
8724 +
8725 +int pic_mode;
8726 +unsigned long mp_lapic_addr;
8727 +
8728 +unsigned int def_to_bigsmp = 0;
8729 +
8730 +/* Processor that is doing the boot up */
8731 +unsigned int boot_cpu_physical_apicid = -1U;
8732 +/* Internal processor count */
8733 +unsigned int __cpuinitdata num_processors;
8734 +
8735 +/* Bitmask of physically existing CPUs */
8736 +physid_mask_t phys_cpu_present_map;
8737 +
8738 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
8739 +
8740 +/*
8741 + * Intel MP BIOS table parsing routines:
8742 + */
8743 +
8744 +
8745 +/*
8746 + * Checksum an MP configuration block.
8747 + */
8748 +
8749 +static int __init mpf_checksum(unsigned char *mp, int len)
8750 +{
8751 +       int sum = 0;
8752 +
8753 +       while (len--)
8754 +               sum += *mp++;
8755 +
8756 +       return sum & 0xFF;
8757 +}
8758 +
8759 +/*
8760 + * Have to match translation table entries to main table entries by counter
8761 + * hence the mpc_record variable .... can't see a less disgusting way of
8762 + * doing this ....
8763 + */
8764 +
8765 +static int mpc_record; 
8766 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
8767 +
8768 +#ifndef CONFIG_XEN
8769 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
8770 +{
8771 +       int ver, apicid;
8772 +       physid_mask_t phys_cpu;
8773 +       
8774 +       if (!(m->mpc_cpuflag & CPU_ENABLED))
8775 +               return;
8776 +
8777 +       apicid = mpc_apic_id(m, translation_table[mpc_record]);
8778 +
8779 +       if (m->mpc_featureflag&(1<<0))
8780 +               Dprintk("    Floating point unit present.\n");
8781 +       if (m->mpc_featureflag&(1<<7))
8782 +               Dprintk("    Machine Exception supported.\n");
8783 +       if (m->mpc_featureflag&(1<<8))
8784 +               Dprintk("    64 bit compare & exchange supported.\n");
8785 +       if (m->mpc_featureflag&(1<<9))
8786 +               Dprintk("    Internal APIC present.\n");
8787 +       if (m->mpc_featureflag&(1<<11))
8788 +               Dprintk("    SEP present.\n");
8789 +       if (m->mpc_featureflag&(1<<12))
8790 +               Dprintk("    MTRR  present.\n");
8791 +       if (m->mpc_featureflag&(1<<13))
8792 +               Dprintk("    PGE  present.\n");
8793 +       if (m->mpc_featureflag&(1<<14))
8794 +               Dprintk("    MCA  present.\n");
8795 +       if (m->mpc_featureflag&(1<<15))
8796 +               Dprintk("    CMOV  present.\n");
8797 +       if (m->mpc_featureflag&(1<<16))
8798 +               Dprintk("    PAT  present.\n");
8799 +       if (m->mpc_featureflag&(1<<17))
8800 +               Dprintk("    PSE  present.\n");
8801 +       if (m->mpc_featureflag&(1<<18))
8802 +               Dprintk("    PSN  present.\n");
8803 +       if (m->mpc_featureflag&(1<<19))
8804 +               Dprintk("    Cache Line Flush Instruction present.\n");
8805 +       /* 20 Reserved */
8806 +       if (m->mpc_featureflag&(1<<21))
8807 +               Dprintk("    Debug Trace and EMON Store present.\n");
8808 +       if (m->mpc_featureflag&(1<<22))
8809 +               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
8810 +       if (m->mpc_featureflag&(1<<23))
8811 +               Dprintk("    MMX  present.\n");
8812 +       if (m->mpc_featureflag&(1<<24))
8813 +               Dprintk("    FXSR  present.\n");
8814 +       if (m->mpc_featureflag&(1<<25))
8815 +               Dprintk("    XMM  present.\n");
8816 +       if (m->mpc_featureflag&(1<<26))
8817 +               Dprintk("    Willamette New Instructions  present.\n");
8818 +       if (m->mpc_featureflag&(1<<27))
8819 +               Dprintk("    Self Snoop  present.\n");
8820 +       if (m->mpc_featureflag&(1<<28))
8821 +               Dprintk("    HT  present.\n");
8822 +       if (m->mpc_featureflag&(1<<29))
8823 +               Dprintk("    Thermal Monitor present.\n");
8824 +       /* 30, 31 Reserved */
8825 +
8826 +
8827 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
8828 +               Dprintk("    Bootup CPU\n");
8829 +               boot_cpu_physical_apicid = m->mpc_apicid;
8830 +       }
8831 +
8832 +       ver = m->mpc_apicver;
8833 +
8834 +       /*
8835 +        * Validate version
8836 +        */
8837 +       if (ver == 0x0) {
8838 +               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
8839 +                               "fixing up to 0x10. (tell your hw vendor)\n",
8840 +                               m->mpc_apicid);
8841 +               ver = 0x10;
8842 +       }
8843 +       apic_version[m->mpc_apicid] = ver;
8844 +
8845 +       phys_cpu = apicid_to_cpu_present(apicid);
8846 +       physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
8847 +
8848 +       if (num_processors >= NR_CPUS) {
8849 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
8850 +                       "  Processor ignored.\n", NR_CPUS);
8851 +               return;
8852 +       }
8853 +
8854 +       if (num_processors >= maxcpus) {
8855 +               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
8856 +                       " Processor ignored.\n", maxcpus);
8857 +               return;
8858 +       }
8859 +
8860 +       cpu_set(num_processors, cpu_possible_map);
8861 +       num_processors++;
8862 +
8863 +       /*
8864 +        * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
8865 +        * but we need to work other dependencies like SMP_SUSPEND etc
8866 +        * before this can be done without some confusion.
8867 +        * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
8868 +        *       - Ashok Raj <ashok.raj@intel.com>
8869 +        */
8870 +       if (num_processors > 8) {
8871 +               switch (boot_cpu_data.x86_vendor) {
8872 +               case X86_VENDOR_INTEL:
8873 +                       if (!APIC_XAPIC(ver)) {
8874 +                               def_to_bigsmp = 0;
8875 +                               break;
8876 +                       }
8877 +                       /* If P4 and above fall through */
8878 +               case X86_VENDOR_AMD:
8879 +                       def_to_bigsmp = 1;
8880 +               }
8881 +       }
8882 +       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
8883 +}
8884 +#else
8885 +void __init MP_processor_info (struct mpc_config_processor *m)
8886 +{
8887 +       num_processors++;
8888 +}
8889 +#endif /* CONFIG_XEN */
8890 +
8891 +static void __init MP_bus_info (struct mpc_config_bus *m)
8892 +{
8893 +       char str[7];
8894 +
8895 +       memcpy(str, m->mpc_bustype, 6);
8896 +       str[6] = 0;
8897 +
8898 +       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
8899 +
8900 +#if MAX_MP_BUSSES < 256
8901 +       if (m->mpc_busid >= MAX_MP_BUSSES) {
8902 +               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
8903 +                       " is too large, max. supported is %d\n",
8904 +                       m->mpc_busid, str, MAX_MP_BUSSES - 1);
8905 +               return;
8906 +       }
8907 +#endif
8908 +
8909 +       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
8910 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
8911 +       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
8912 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
8913 +       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
8914 +               mpc_oem_pci_bus(m, translation_table[mpc_record]);
8915 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
8916 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
8917 +               mp_current_pci_id++;
8918 +       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
8919 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
8920 +       } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
8921 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
8922 +       } else {
8923 +               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
8924 +       }
8925 +}
8926 +
8927 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
8928 +{
8929 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
8930 +               return;
8931 +
8932 +       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
8933 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
8934 +       if (nr_ioapics >= MAX_IO_APICS) {
8935 +               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
8936 +                       MAX_IO_APICS, nr_ioapics);
8937 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
8938 +       }
8939 +       if (!m->mpc_apicaddr) {
8940 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
8941 +                       " found in MP table, skipping!\n");
8942 +               return;
8943 +       }
8944 +       mp_ioapics[nr_ioapics] = *m;
8945 +       nr_ioapics++;
8946 +}
8947 +
8948 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
8949 +{
8950 +       mp_irqs [mp_irq_entries] = *m;
8951 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
8952 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
8953 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
8954 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
8955 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
8956 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
8957 +               panic("Max # of irq sources exceeded!!\n");
8958 +}
8959 +
8960 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
8961 +{
8962 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
8963 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
8964 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
8965 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
8966 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
8967 +}
8968 +
8969 +#ifdef CONFIG_X86_NUMAQ
8970 +static void __init MP_translation_info (struct mpc_config_translation *m)
8971 +{
8972 +       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
8973 +
8974 +       if (mpc_record >= MAX_MPC_ENTRY) 
8975 +               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
8976 +       else
8977 +               translation_table[mpc_record] = m; /* stash this for later */
8978 +       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
8979 +               node_set_online(m->trans_quad);
8980 +}
8981 +
8982 +/*
8983 + * Read/parse the MPC oem tables
8984 + */
8985 +
8986 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
8987 +       unsigned short oemsize)
8988 +{
8989 +       int count = sizeof (*oemtable); /* the header size */
8990 +       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
8991 +       
8992 +       mpc_record = 0;
8993 +       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
8994 +       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
8995 +       {
8996 +               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
8997 +                       oemtable->oem_signature[0],
8998 +                       oemtable->oem_signature[1],
8999 +                       oemtable->oem_signature[2],
9000 +                       oemtable->oem_signature[3]);
9001 +               return;
9002 +       }
9003 +       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
9004 +       {
9005 +               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9006 +               return;
9007 +       }
9008 +       while (count < oemtable->oem_length) {
9009 +               switch (*oemptr) {
9010 +                       case MP_TRANSLATION:
9011 +                       {
9012 +                               struct mpc_config_translation *m=
9013 +                                       (struct mpc_config_translation *)oemptr;
9014 +                               MP_translation_info(m);
9015 +                               oemptr += sizeof(*m);
9016 +                               count += sizeof(*m);
9017 +                               ++mpc_record;
9018 +                               break;
9019 +                       }
9020 +                       default:
9021 +                       {
9022 +                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
9023 +                               return;
9024 +                       }
9025 +               }
9026 +       }
9027 +}
9028 +
9029 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9030 +               char *productid)
9031 +{
9032 +       if (strncmp(oem, "IBM NUMA", 8))
9033 +               printk("Warning!  May not be a NUMA-Q system!\n");
9034 +       if (mpc->mpc_oemptr)
9035 +               smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
9036 +                               mpc->mpc_oemsize);
9037 +}
9038 +#endif /* CONFIG_X86_NUMAQ */
9039 +
9040 +/*
9041 + * Read/parse the MPC
9042 + */
9043 +
9044 +static int __init smp_read_mpc(struct mp_config_table *mpc)
9045 +{
9046 +       char str[16];
9047 +       char oem[10];
9048 +       int count=sizeof(*mpc);
9049 +       unsigned char *mpt=((unsigned char *)mpc)+count;
9050 +
9051 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
9052 +               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
9053 +                       *(u32 *)mpc->mpc_signature);
9054 +               return 0;
9055 +       }
9056 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
9057 +               printk(KERN_ERR "SMP mptable: checksum error!\n");
9058 +               return 0;
9059 +       }
9060 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
9061 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
9062 +                       mpc->mpc_spec);
9063 +               return 0;
9064 +       }
9065 +       if (!mpc->mpc_lapic) {
9066 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
9067 +               return 0;
9068 +       }
9069 +       memcpy(oem,mpc->mpc_oem,8);
9070 +       oem[8]=0;
9071 +       printk(KERN_INFO "OEM ID: %s ",oem);
9072 +
9073 +       memcpy(str,mpc->mpc_productid,12);
9074 +       str[12]=0;
9075 +       printk("Product ID: %s ",str);
9076 +
9077 +       mps_oem_check(mpc, oem, str);
9078 +
9079 +       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
9080 +
9081 +       /* 
9082 +        * Save the local APIC address (it might be non-default) -- but only
9083 +        * if we're not using ACPI.
9084 +        */
9085 +       if (!acpi_lapic)
9086 +               mp_lapic_addr = mpc->mpc_lapic;
9087 +
9088 +       /*
9089 +        *      Now process the configuration blocks.
9090 +        */
9091 +       mpc_record = 0;
9092 +       while (count < mpc->mpc_length) {
9093 +               switch(*mpt) {
9094 +                       case MP_PROCESSOR:
9095 +                       {
9096 +                               struct mpc_config_processor *m=
9097 +                                       (struct mpc_config_processor *)mpt;
9098 +                               /* ACPI may have already provided this data */
9099 +                               if (!acpi_lapic)
9100 +                                       MP_processor_info(m);
9101 +                               mpt += sizeof(*m);
9102 +                               count += sizeof(*m);
9103 +                               break;
9104 +                       }
9105 +                       case MP_BUS:
9106 +                       {
9107 +                               struct mpc_config_bus *m=
9108 +                                       (struct mpc_config_bus *)mpt;
9109 +                               MP_bus_info(m);
9110 +                               mpt += sizeof(*m);
9111 +                               count += sizeof(*m);
9112 +                               break;
9113 +                       }
9114 +                       case MP_IOAPIC:
9115 +                       {
9116 +                               struct mpc_config_ioapic *m=
9117 +                                       (struct mpc_config_ioapic *)mpt;
9118 +                               MP_ioapic_info(m);
9119 +                               mpt+=sizeof(*m);
9120 +                               count+=sizeof(*m);
9121 +                               break;
9122 +                       }
9123 +                       case MP_INTSRC:
9124 +                       {
9125 +                               struct mpc_config_intsrc *m=
9126 +                                       (struct mpc_config_intsrc *)mpt;
9127 +
9128 +                               MP_intsrc_info(m);
9129 +                               mpt+=sizeof(*m);
9130 +                               count+=sizeof(*m);
9131 +                               break;
9132 +                       }
9133 +                       case MP_LINTSRC:
9134 +                       {
9135 +                               struct mpc_config_lintsrc *m=
9136 +                                       (struct mpc_config_lintsrc *)mpt;
9137 +                               MP_lintsrc_info(m);
9138 +                               mpt+=sizeof(*m);
9139 +                               count+=sizeof(*m);
9140 +                               break;
9141 +                       }
9142 +                       default:
9143 +                       {
9144 +                               count = mpc->mpc_length;
9145 +                               break;
9146 +                       }
9147 +               }
9148 +               ++mpc_record;
9149 +       }
9150 +       clustered_apic_check();
9151 +       if (!num_processors)
9152 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
9153 +       return num_processors;
9154 +}
9155 +
9156 +static int __init ELCR_trigger(unsigned int irq)
9157 +{
9158 +       unsigned int port;
9159 +
9160 +       port = 0x4d0 + (irq >> 3);
9161 +       return (inb(port) >> (irq & 7)) & 1;
9162 +}
9163 +
9164 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
9165 +{
9166 +       struct mpc_config_intsrc intsrc;
9167 +       int i;
9168 +       int ELCR_fallback = 0;
9169 +
9170 +       intsrc.mpc_type = MP_INTSRC;
9171 +       intsrc.mpc_irqflag = 0;                 /* conforming */
9172 +       intsrc.mpc_srcbus = 0;
9173 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9174 +
9175 +       intsrc.mpc_irqtype = mp_INT;
9176 +
9177 +       /*
9178 +        *  If true, we have an ISA/PCI system with no IRQ entries
9179 +        *  in the MP table. To prevent the PCI interrupts from being set up
9180 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
9181 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
9182 +        *  never be level sensitive, so we simply see if the ELCR agrees.
9183 +        *  If it does, we assume it's valid.
9184 +        */
9185 +       if (mpc_default_type == 5) {
9186 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
9187 +
9188 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
9189 +                       printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
9190 +               else {
9191 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
9192 +                       ELCR_fallback = 1;
9193 +               }
9194 +       }
9195 +
9196 +       for (i = 0; i < 16; i++) {
9197 +               switch (mpc_default_type) {
9198 +               case 2:
9199 +                       if (i == 0 || i == 13)
9200 +                               continue;       /* IRQ0 & IRQ13 not connected */
9201 +                       /* fall through */
9202 +               default:
9203 +                       if (i == 2)
9204 +                               continue;       /* IRQ2 is never connected */
9205 +               }
9206 +
9207 +               if (ELCR_fallback) {
9208 +                       /*
9209 +                        *  If the ELCR indicates a level-sensitive interrupt, we
9210 +                        *  copy that information over to the MP table in the
9211 +                        *  irqflag field (level sensitive, active high polarity).
9212 +                        */
9213 +                       if (ELCR_trigger(i))
9214 +                               intsrc.mpc_irqflag = 13;
9215 +                       else
9216 +                               intsrc.mpc_irqflag = 0;
9217 +               }
9218 +
9219 +               intsrc.mpc_srcbusirq = i;
9220 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
9221 +               MP_intsrc_info(&intsrc);
9222 +       }
9223 +
9224 +       intsrc.mpc_irqtype = mp_ExtINT;
9225 +       intsrc.mpc_srcbusirq = 0;
9226 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
9227 +       MP_intsrc_info(&intsrc);
9228 +}
9229 +
9230 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9231 +{
9232 +       struct mpc_config_processor processor;
9233 +       struct mpc_config_bus bus;
9234 +       struct mpc_config_ioapic ioapic;
9235 +       struct mpc_config_lintsrc lintsrc;
9236 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
9237 +       int i;
9238 +
9239 +       /*
9240 +        * local APIC has default address
9241 +        */
9242 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9243 +
9244 +       /*
9245 +        * 2 CPUs, numbered 0 & 1.
9246 +        */
9247 +       processor.mpc_type = MP_PROCESSOR;
9248 +       /* Either an integrated APIC or a discrete 82489DX. */
9249 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9250 +       processor.mpc_cpuflag = CPU_ENABLED;
9251 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9252 +                                  (boot_cpu_data.x86_model << 4) |
9253 +                                  boot_cpu_data.x86_mask;
9254 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9255 +       processor.mpc_reserved[0] = 0;
9256 +       processor.mpc_reserved[1] = 0;
9257 +       for (i = 0; i < 2; i++) {
9258 +               processor.mpc_apicid = i;
9259 +               MP_processor_info(&processor);
9260 +       }
9261 +
9262 +       bus.mpc_type = MP_BUS;
9263 +       bus.mpc_busid = 0;
9264 +       switch (mpc_default_type) {
9265 +               default:
9266 +                       printk("???\n");
9267 +                       printk(KERN_ERR "Unknown standard configuration %d\n",
9268 +                               mpc_default_type);
9269 +                       /* fall through */
9270 +               case 1:
9271 +               case 5:
9272 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
9273 +                       break;
9274 +               case 2:
9275 +               case 6:
9276 +               case 3:
9277 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
9278 +                       break;
9279 +               case 4:
9280 +               case 7:
9281 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
9282 +       }
9283 +       MP_bus_info(&bus);
9284 +       if (mpc_default_type > 4) {
9285 +               bus.mpc_busid = 1;
9286 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
9287 +               MP_bus_info(&bus);
9288 +       }
9289 +
9290 +       ioapic.mpc_type = MP_IOAPIC;
9291 +       ioapic.mpc_apicid = 2;
9292 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9293 +       ioapic.mpc_flags = MPC_APIC_USABLE;
9294 +       ioapic.mpc_apicaddr = 0xFEC00000;
9295 +       MP_ioapic_info(&ioapic);
9296 +
9297 +       /*
9298 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
9299 +        */
9300 +       construct_default_ioirq_mptable(mpc_default_type);
9301 +
9302 +       lintsrc.mpc_type = MP_LINTSRC;
9303 +       lintsrc.mpc_irqflag = 0;                /* conforming */
9304 +       lintsrc.mpc_srcbusid = 0;
9305 +       lintsrc.mpc_srcbusirq = 0;
9306 +       lintsrc.mpc_destapic = MP_APIC_ALL;
9307 +       for (i = 0; i < 2; i++) {
9308 +               lintsrc.mpc_irqtype = linttypes[i];
9309 +               lintsrc.mpc_destapiclint = i;
9310 +               MP_lintsrc_info(&lintsrc);
9311 +       }
9312 +}
9313 +
9314 +static struct intel_mp_floating *mpf_found;
9315 +
9316 +/*
9317 + * Scan the memory blocks for an SMP configuration block.
9318 + */
9319 +void __init get_smp_config (void)
9320 +{
9321 +       struct intel_mp_floating *mpf = mpf_found;
9322 +
9323 +       /*
9324 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
9325 +        * processors, where MPS only supports physical.
9326 +        */
9327 +       if (acpi_lapic && acpi_ioapic) {
9328 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
9329 +               return;
9330 +       }
9331 +       else if (acpi_lapic)
9332 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
9333 +
9334 +       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
9335 +       if (mpf->mpf_feature2 & (1<<7)) {
9336 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
9337 +               pic_mode = 1;
9338 +       } else {
9339 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
9340 +               pic_mode = 0;
9341 +       }
9342 +
9343 +       /*
9344 +        * Now see if we need to read further.
9345 +        */
9346 +       if (mpf->mpf_feature1 != 0) {
9347 +
9348 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
9349 +               construct_default_ISA_mptable(mpf->mpf_feature1);
9350 +
9351 +       } else if (mpf->mpf_physptr) {
9352 +
9353 +               /*
9354 +                * Read the physical hardware table.  Anything here will
9355 +                * override the defaults.
9356 +                */
9357 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
9358 +                       smp_found_config = 0;
9359 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
9360 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
9361 +                       return;
9362 +               }
9363 +               /*
9364 +                * If there are no explicit MP IRQ entries, then we are
9365 +                * broken.  We set up most of the low 16 IO-APIC pins to
9366 +                * ISA defaults and hope it will work.
9367 +                */
9368 +               if (!mp_irq_entries) {
9369 +                       struct mpc_config_bus bus;
9370 +
9371 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
9372 +
9373 +                       bus.mpc_type = MP_BUS;
9374 +                       bus.mpc_busid = 0;
9375 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
9376 +                       MP_bus_info(&bus);
9377 +
9378 +                       construct_default_ioirq_mptable(0);
9379 +               }
9380 +
9381 +       } else
9382 +               BUG();
9383 +
9384 +       printk(KERN_INFO "Processors: %d\n", num_processors);
9385 +       /*
9386 +        * Only use the first configuration found.
9387 +        */
9388 +}
9389 +
9390 +static int __init smp_scan_config (unsigned long base, unsigned long length)
9391 +{
9392 +       unsigned long *bp = isa_bus_to_virt(base);
9393 +       struct intel_mp_floating *mpf;
9394 +
9395 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
9396 +       if (sizeof(*mpf) != 16)
9397 +               printk("Error: MPF size\n");
9398 +
9399 +       while (length > 0) {
9400 +               mpf = (struct intel_mp_floating *)bp;
9401 +               if ((*bp == SMP_MAGIC_IDENT) &&
9402 +                       (mpf->mpf_length == 1) &&
9403 +                       !mpf_checksum((unsigned char *)bp, 16) &&
9404 +                       ((mpf->mpf_specification == 1)
9405 +                               || (mpf->mpf_specification == 4)) ) {
9406 +
9407 +                       smp_found_config = 1;
9408 +#ifndef CONFIG_XEN
9409 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
9410 +                                               virt_to_phys(mpf));
9411 +                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
9412 +                       if (mpf->mpf_physptr) {
9413 +                               /*
9414 +                                * We cannot access the MPC table to compute
9415 +                                * its size yet, as only a few megabytes from
9416 +                                * the bottom are mapped at this point.
9417 +                                * The PC-9800's MPC table is placed at the very
9418 +                                * end of physical memory, so simply reserving
9419 +                                * PAGE_SIZE from mpf->mpf_physptr would trigger
9420 +                                * BUG() in reserve_bootmem.
9421 +                                */
9422 +                               unsigned long size = PAGE_SIZE;
9423 +                               unsigned long end = max_low_pfn * PAGE_SIZE;
9424 +                               if (mpf->mpf_physptr + size > end)
9425 +                                       size = end - mpf->mpf_physptr;
9426 +                               reserve_bootmem(mpf->mpf_physptr, size);
9427 +                       }
9428 +#else
9429 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
9430 +                               ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
9431 +#endif
9432 +
9433 +                       mpf_found = mpf;
9434 +                       return 1;
9435 +               }
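+               /*
+                * The MP floating pointer structure must sit on a 16-byte
+                * boundary, so advance bp (an unsigned long pointer, four
+                * bytes per element on i386) by four elements = 16 bytes.
+                */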
9436 +               bp += 4;
9437 +               length -= 16;
9438 +       }
9439 +       return 0;
9440 +}
9441 +
9442 +void __init find_smp_config (void)
9443 +{
9444 +#ifndef CONFIG_XEN
9445 +       unsigned int address;
9446 +#endif
9447 +
9448 +       /*
9449 +        * FIXME: Linux assumes you have 640K of base ram..
9450 +        * this continues the error...
9451 +        *
9452 +        * 1) Scan the bottom 1K for a signature
9453 +        * 2) Scan the top 1K of base RAM
9454 +        * 3) Scan the 64K of bios
9455 +        */
9456 +       if (smp_scan_config(0x0,0x400) ||
9457 +               smp_scan_config(639*0x400,0x400) ||
9458 +                       smp_scan_config(0xF0000,0x10000))
9459 +               return;
9460 +       /*
9461 +        * If it is an SMP machine we should know now, unless the
9462 +        * configuration is in an EISA/MCA bus machine with an
9463 +        * extended bios data area.
9464 +        *
9465 +        * there is a real-mode segmented pointer pointing to the
9466 +        * 4K EBDA area at 0x40E, calculate and scan it here.
9467 +        *
9468 +        * NOTE! There are Linux loaders that will corrupt the EBDA
9469 +        * area, and as such this kind of SMP config may be less
9470 +        * trustworthy, simply because the SMP table may have been
9471 +        * stomped on during early boot. These loaders are buggy and
9472 +        * should be fixed.
9473 +        *
9474 +        * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
9475 +        */
9476 +
9477 +#ifndef CONFIG_XEN
9478 +       address = get_bios_ebda();
9479 +       if (address)
9480 +               smp_scan_config(address, 0x400);
9481 +#endif
9482 +}
9483 +
9484 +int es7000_plat;
9485 +
9486 +/* --------------------------------------------------------------------------
9487 +                            ACPI-based MP Configuration
9488 +   -------------------------------------------------------------------------- */
9489 +
9490 +#ifdef CONFIG_ACPI
9491 +
9492 +void __init mp_register_lapic_address(u64 address)
9493 +{
9494 +#ifndef CONFIG_XEN
9495 +       mp_lapic_addr = (unsigned long) address;
9496 +
9497 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
9498 +
9499 +       if (boot_cpu_physical_apicid == -1U)
9500 +               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
9501 +
9502 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
9503 +#endif
9504 +}
9505 +
9506 +void __devinit mp_register_lapic (u8 id, u8 enabled)
9507 +{
9508 +       struct mpc_config_processor processor;
9509 +       int boot_cpu = 0;
9510 +       
9511 +       if (MAX_APICS - id <= 0) {
9512 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
9513 +                       id, MAX_APICS);
9514 +               return;
9515 +       }
9516 +
9517 +       if (id == boot_cpu_physical_apicid)
9518 +               boot_cpu = 1;
9519 +
9520 +#ifndef CONFIG_XEN
9521 +       processor.mpc_type = MP_PROCESSOR;
9522 +       processor.mpc_apicid = id;
9523 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
9524 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
9525 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
9526 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
9527 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9528 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9529 +       processor.mpc_reserved[0] = 0;
9530 +       processor.mpc_reserved[1] = 0;
9531 +#endif
9532 +
9533 +       MP_processor_info(&processor);
9534 +}
9535 +
9536 +#ifdef CONFIG_X86_IO_APIC
9537 +
9538 +#define MP_ISA_BUS             0
9539 +#define MP_MAX_IOAPIC_PIN      127
9540 +
9541 +static struct mp_ioapic_routing {
9542 +       int                     apic_id;
9543 +       int                     gsi_base;
9544 +       int                     gsi_end;
9545 +       u32                     pin_programmed[4];
9546 +} mp_ioapic_routing[MAX_IO_APICS];
9547 +
9548 +static int mp_find_ioapic (int gsi)
9549 +{
9550 +       int i = 0;
9551 +
9552 +       /* Find the IOAPIC that manages this GSI. */
9553 +       for (i = 0; i < nr_ioapics; i++) {
9554 +               if ((gsi >= mp_ioapic_routing[i].gsi_base)
9555 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
9556 +                       return i;
9557 +       }
9558 +
9559 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9560 +
9561 +       return -1;
9562 +}
9563 +
9564 +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
9565 +{
9566 +       int idx = 0;
9567 +       int tmpid;
9568 +
9569 +       if (nr_ioapics >= MAX_IO_APICS) {
9570 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
9571 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
9572 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
9573 +       }
9574 +       if (!address) {
9575 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
9576 +                       " found in MADT table, skipping!\n");
9577 +               return;
9578 +       }
9579 +
9580 +       idx = nr_ioapics++;
9581 +
9582 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
9583 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9584 +       mp_ioapics[idx].mpc_apicaddr = address;
9585 +
9586 +#ifndef CONFIG_XEN
9587 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9588 +#endif
9589 +       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
9590 +               && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9591 +               tmpid = io_apic_get_unique_id(idx, id);
9592 +       else
9593 +               tmpid = id;
9594 +       if (tmpid == -1) {
9595 +               nr_ioapics--;
9596 +               return;
9597 +       }
9598 +       mp_ioapics[idx].mpc_apicid = tmpid;
9599 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9600 +       
9601 +       /* 
9602 +        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9603 +        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9604 +        */
9605 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9606 +       mp_ioapic_routing[idx].gsi_base = gsi_base;
9607 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
9608 +               io_apic_get_redir_entries(idx);
9609 +
9610 +       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
9611 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
9612 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9613 +               mp_ioapic_routing[idx].gsi_base,
9614 +               mp_ioapic_routing[idx].gsi_end);
9615 +}
9616 +
9617 +void __init
9618 +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9619 +{
9620 +       struct mpc_config_intsrc intsrc;
9621 +       int                     ioapic = -1;
9622 +       int                     pin = -1;
9623 +
9624 +       /* 
9625 +        * Convert 'gsi' to 'ioapic.pin'.
9626 +        */
9627 +       ioapic = mp_find_ioapic(gsi);
9628 +       if (ioapic < 0)
9629 +               return;
9630 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9631 +
9632 +       /*
9633 +        * TBD: This check is for faulty timer entries, where the override
9634 +        *      erroneously sets the trigger to level, resulting in a HUGE 
9635 +        *      increase of timer interrupts!
9636 +        */
9637 +       if ((bus_irq == 0) && (trigger == 3))
9638 +               trigger = 1;
9639 +
9640 +       intsrc.mpc_type = MP_INTSRC;
9641 +       intsrc.mpc_irqtype = mp_INT;
9642 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
9643 +       intsrc.mpc_srcbus = MP_ISA_BUS;
9644 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
9645 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
9646 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
9647 +
9648 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
9649 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
9650 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
9651 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
9652 +
9653 +       mp_irqs[mp_irq_entries] = intsrc;
9654 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
9655 +               panic("Max # of irq sources exceeded!\n");
9656 +}
9657 +
9658 +void __init mp_config_acpi_legacy_irqs (void)
9659 +{
9660 +       struct mpc_config_intsrc intsrc;
9661 +       int i = 0;
9662 +       int ioapic = -1;
9663 +
9664 +       /* 
9665 +        * Fabricate the legacy ISA bus (bus #31).
9666 +        */
9667 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9668 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9669 +
9670 +       /*
9671 +        * Older generations of ES7000 have no legacy identity mappings
9672 +        */
9673 +       if (es7000_plat == 1)
9674 +               return;
9675 +
9676 +       /* 
9677 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
9678 +        */
9679 +       ioapic = mp_find_ioapic(0);
9680 +       if (ioapic < 0)
9681 +               return;
9682 +
9683 +       intsrc.mpc_type = MP_INTSRC;
9684 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
9685 +       intsrc.mpc_srcbus = MP_ISA_BUS;
9686 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9687 +
9688 +       /* 
9689 +        * Use the default configuration for the IRQs 0-15, unless
9690 +        * overridden by (MADT) interrupt source override entries.
9691 +        */
9692 +       for (i = 0; i < 16; i++) {
9693 +               int idx;
9694 +
9695 +               for (idx = 0; idx < mp_irq_entries; idx++) {
9696 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
9697 +
9698 +                       /* Do we already have a mapping for this ISA IRQ? */
9699 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
9700 +                               break;
9701 +
9702 +                       /* Do we already have a mapping for this IOAPIC pin */
9703 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9704 +                               (irq->mpc_dstirq == i))
9705 +                               break;
9706 +               }
9707 +
9708 +               if (idx != mp_irq_entries) {
9709 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9710 +                       continue;                       /* IRQ already used */
9711 +               }
9712 +
9713 +               intsrc.mpc_irqtype = mp_INT;
9714 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
9715 +               intsrc.mpc_dstirq = i;
9716 +
9717 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
9718 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
9719 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
9720 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
9721 +                       intsrc.mpc_dstirq);
9722 +
9723 +               mp_irqs[mp_irq_entries] = intsrc;
9724 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
9725 +                       panic("Max # of irq sources exceeded!\n");
9726 +       }
9727 +}
9728 +
9729 +#define MAX_GSI_NUM    4096
9730 +
9731 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
9732 +{
9733 +       int ioapic = -1;
9734 +       int ioapic_pin = 0;
9735 +       int idx, bit = 0;
9736 +       static int pci_irq = 16;
9737 +       /*
9738 +        * Mapping between Global System Interrupts, which
9739 +        * represent all possible interrupts, and IRQs
9740 +        * assigned to actual devices.
9741 +        */
9742 +       static int              gsi_to_irq[MAX_GSI_NUM];
9743 +
9744 +       /* Don't set up the ACPI SCI because it's already set up */
9745 +       if (acpi_fadt.sci_int == gsi)
9746 +               return gsi;
9747 +
9748 +       ioapic = mp_find_ioapic(gsi);
9749 +       if (ioapic < 0) {
9750 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
9751 +               return gsi;
9752 +       }
9753 +
9754 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9755 +
9756 +       if (ioapic_renumber_irq)
9757 +               gsi = ioapic_renumber_irq(ioapic, gsi);
9758 +
9759 +       /* 
9760 +        * Avoid pin reprogramming.  PRTs typically include entries  
9761 +        * with redundant pin->gsi mappings (but unique PCI devices);
9762 +        * we only program the IOAPIC on the first.
9763 +        */
9764 +       bit = ioapic_pin % 32;
9765 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
9766 +       if (idx > 3) {
9767 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
9768 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
9769 +                       ioapic_pin);
9770 +               return gsi;
9771 +       }
9772 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
9773 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
9774 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
9775 +               return gsi_to_irq[gsi];
9776 +       }
9777 +
9778 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
9779 +
9780 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
9781 +               /*
9782 +                * For PCI devices assign IRQs in order, avoiding gaps
9783 +                * due to unused I/O APIC pins.
9784 +                */
9785 +               int irq = gsi;
9786 +               if (gsi < MAX_GSI_NUM) {
9787 +                       /*
9788 +                        * Retain the VIA chipset work-around (gsi > 15), but
9789 +                        * avoid a problem where the 8254 timer (IRQ0) is setup
9790 +                        * via an override (so it's not on pin 0 of the ioapic),
9791 +                        * and at the same time, the pin 0 interrupt is a PCI
9792 +                        * type.  The gsi > 15 test could cause these two pins
9793 +                        * to be shared as IRQ0, and they are not shareable.
9794 +                        * So test for this condition, and if necessary, avoid
9795 +                        * the pin collision.
9796 +                        */
9797 +                       if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
9798 +                               gsi = pci_irq++;
9799 +                       /*
9800 +                        * Don't assign IRQ used by ACPI SCI
9801 +                        */
9802 +                       if (gsi == acpi_fadt.sci_int)
9803 +                               gsi = pci_irq++;
9804 +                       gsi_to_irq[irq] = gsi;
9805 +               } else {
9806 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
9807 +                       return gsi;
9808 +               }
9809 +       }
9810 +
9811 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
9812 +                   triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
9813 +                   polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
9814 +       return gsi;
9815 +}
9816 +
9817 +#endif /* CONFIG_X86_IO_APIC */
9818 +#endif /* CONFIG_ACPI */
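(For context: a minimal sketch of the MP-specification checksum rule that the
mpf_checksum() calls above rely on: every byte of a valid structure, summed
modulo 256, must come to zero.  The helper name below is invented for
illustration only and does not appear in the patch.)

	static int mp_table_checksum_ok(const unsigned char *p, unsigned int len)
	{
		unsigned char sum = 0;

		while (len--)
			sum += *p++;	/* a valid MP structure sums to 0 mod 256 */
		return sum == 0;
	}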
9819 diff -ruNp linux-2.6.19/arch/i386/kernel/pci-dma-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/pci-dma-xen.c
9820 --- linux-2.6.19/arch/i386/kernel/pci-dma-xen.c 1970-01-01 00:00:00.000000000 +0000
9821 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/pci-dma-xen.c       2007-02-02 19:10:21.000000000 +0000
9822 @@ -0,0 +1,377 @@
9823 +/*
9824 + * Dynamic DMA mapping support.
9825 + *
9826 + * On i386 there is no hardware dynamic DMA address translation,
9827 + * so consistent alloc/free are merely page allocation/freeing.
9828 + * The rest of the dynamic DMA mapping interface is implemented
9829 + * in asm/pci.h.
9830 + */
9831 +
9832 +#include <linux/types.h>
9833 +#include <linux/mm.h>
9834 +#include <linux/string.h>
9835 +#include <linux/pci.h>
9836 +#include <linux/module.h>
9837 +#include <linux/version.h>
9838 +#include <asm/io.h>
9839 +#include <xen/balloon.h>
9840 +#include <asm/swiotlb.h>
9841 +#include <asm/tlbflush.h>
9842 +#include <asm-i386/mach-xen/asm/swiotlb.h>
9843 +#include <asm/bug.h>
9844 +
9845 +#ifdef __x86_64__
9846 +int iommu_merge __read_mostly = 0;
9847 +EXPORT_SYMBOL(iommu_merge);
9848 +
9849 +dma_addr_t bad_dma_address __read_mostly;
9850 +EXPORT_SYMBOL(bad_dma_address);
9851 +
9852 +/* This tells the BIO block layer to assume merging. Default to off
9853 +   because we cannot guarantee merging later. */
9854 +int iommu_bio_merge __read_mostly = 0;
9855 +EXPORT_SYMBOL(iommu_bio_merge);
9856 +
9857 +int iommu_sac_force __read_mostly = 0;
9858 +EXPORT_SYMBOL(iommu_sac_force);
9859 +
9860 +int no_iommu __read_mostly;
9861 +#ifdef CONFIG_IOMMU_DEBUG
9862 +int panic_on_overflow __read_mostly = 1;
9863 +int force_iommu __read_mostly = 1;
9864 +#else
9865 +int panic_on_overflow __read_mostly = 0;
9866 +int force_iommu __read_mostly= 0;
9867 +#endif
9868 +
9869 +/* Set this to 1 if there is a HW IOMMU in the system */
9870 +int iommu_detected __read_mostly = 0;
9871 +
9872 +void __init pci_iommu_alloc(void)
9873 +{
9874 +       /*
9875 +        * The order of these functions is important for
9876 +        * fall-back/fail-over reasons
9877 +        */
9878 +#ifdef CONFIG_IOMMU
9879 +       iommu_hole_init();
9880 +#endif
9881 +
9882 +#ifdef CONFIG_CALGARY_IOMMU
9883 +#include <asm/calgary.h>
9884 +       detect_calgary();
9885 +#endif
9886 +
9887 +#ifdef CONFIG_SWIOTLB
9888 +       pci_swiotlb_init();
9889 +#endif
9890 +}
9891 +
9892 +__init int iommu_setup(char *p)
9893 +{
9894 +    return 1;
9895 +}
9896 +#endif
9897 +
9898 +struct dma_coherent_mem {
9899 +       void            *virt_base;
9900 +       u32             device_base;
9901 +       int             size;
9902 +       int             flags;
9903 +       unsigned long   *bitmap;
9904 +};
9905 +
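+/*
+ * There is no hardware IOMMU to fall back on here, so a DMA address the
+ * device cannot reach (or a buffer straddling a discontiguous page boundary)
+ * is fatal unless the swiotlb bounce buffer is forced on with "swiotlb=force".
+ */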
9906 +#define IOMMU_BUG_ON(test)                             \
9907 +do {                                                   \
9908 +       if (unlikely(test)) {                           \
9909 +               printk(KERN_ALERT "Fatal DMA error! "   \
9910 +                      "Please use 'swiotlb=force'\n"); \
9911 +               BUG();                                  \
9912 +       }                                               \
9913 +} while (0)
9914 +
9915 +int
9916 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
9917 +          enum dma_data_direction direction)
9918 +{
9919 +       int i, rc;
9920 +
9921 +       BUG_ON(!valid_dma_direction(direction));
9922 +       WARN_ON(nents == 0 || sg[0].length == 0);
9923 +
9924 +       if (swiotlb) {
9925 +               rc = swiotlb_map_sg(hwdev, sg, nents, direction);
9926 +       } else {
9927 +               for (i = 0; i < nents; i++ ) {
9928 +                       sg[i].dma_address =
9929 +                               page_to_bus(sg[i].page) + sg[i].offset;
9930 +                       sg[i].dma_length  = sg[i].length;
9931 +                       BUG_ON(!sg[i].page);
9932 +                       IOMMU_BUG_ON(address_needs_mapping(
9933 +                               hwdev, sg[i].dma_address));
9934 +               }
9935 +               rc = nents;
9936 +       }
9937 +
9938 +       flush_write_buffers();
9939 +       return rc;
9940 +}
9941 +EXPORT_SYMBOL(dma_map_sg);
9942 +
9943 +void
9944 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
9945 +            enum dma_data_direction direction)
9946 +{
9947 +       BUG_ON(!valid_dma_direction(direction));
9948 +       if (swiotlb)
9949 +               swiotlb_unmap_sg(hwdev, sg, nents, direction);
9950 +}
9951 +EXPORT_SYMBOL(dma_unmap_sg);
9952 +
9953 +/*
9954 + * XXX This file is also used by xenLinux/ia64. 
9955 + * "defined(__i386__) || defined (__x86_64__)" means "!defined(__ia64__)".
9956 + * This #if workaround should be removed once this file is merged back into
9957 + * i386's pci-dma or is moved to drivers/xen/core.
9958 + */
9959 +#if defined(__i386__) || defined(__x86_64__)
9960 +dma_addr_t
9961 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
9962 +            size_t size, enum dma_data_direction direction)
9963 +{
9964 +       dma_addr_t dma_addr;
9965 +
9966 +       BUG_ON(!valid_dma_direction(direction));
9967 +
9968 +       if (swiotlb) {
9969 +               dma_addr = swiotlb_map_page(
9970 +                       dev, page, offset, size, direction);
9971 +       } else {
9972 +               dma_addr = page_to_bus(page) + offset;
9973 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
9974 +       }
9975 +
9976 +       return dma_addr;
9977 +}
9978 +EXPORT_SYMBOL(dma_map_page);
9979 +
9980 +void
9981 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
9982 +              enum dma_data_direction direction)
9983 +{
9984 +       BUG_ON(!valid_dma_direction(direction));
9985 +       if (swiotlb)
9986 +               swiotlb_unmap_page(dev, dma_address, size, direction);
9987 +}
9988 +EXPORT_SYMBOL(dma_unmap_page);
9989 +#endif /* defined(__i386__) || defined(__x86_64__) */
9990 +
9991 +int
9992 +dma_mapping_error(dma_addr_t dma_addr)
9993 +{
9994 +       if (swiotlb)
9995 +               return swiotlb_dma_mapping_error(dma_addr);
9996 +       return 0;
9997 +}
9998 +EXPORT_SYMBOL(dma_mapping_error);
9999 +
10000 +int
10001 +dma_supported(struct device *dev, u64 mask)
10002 +{
10003 +       if (swiotlb)
10004 +               return swiotlb_dma_supported(dev, mask);
10005 +       /*
10006 +        * By default we'll BUG when an infeasible DMA is requested, and
10007 +        * request swiotlb=force (see IOMMU_BUG_ON).
10008 +        */
10009 +       return 1;
10010 +}
10011 +EXPORT_SYMBOL(dma_supported);
10012 +
10013 +void *dma_alloc_coherent(struct device *dev, size_t size,
10014 +                          dma_addr_t *dma_handle, gfp_t gfp)
10015 +{
10016 +       void *ret;
10017 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10018 +       unsigned int order = get_order(size);
10019 +       unsigned long vstart;
10020 +       /* ignore region specifiers */
10021 +       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
10022 +
10023 +       if (mem) {
10024 +               int page = bitmap_find_free_region(mem->bitmap, mem->size,
10025 +                                                    order);
10026 +               if (page >= 0) {
10027 +                       *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10028 +                       ret = mem->virt_base + (page << PAGE_SHIFT);
10029 +                       memset(ret, 0, size);
10030 +                       return ret;
10031 +               }
10032 +               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10033 +                       return NULL;
10034 +       }
10035 +
10036 +       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
10037 +               gfp |= GFP_DMA;
10038 +
10039 +       vstart = __get_free_pages(gfp, order);
10040 +       ret = (void *)vstart;
10041 +
10042 +       if (ret != NULL) {
10043 +               if (xen_create_contiguous_region(vstart, order,
10044 +                                                dma_bits) != 0) {
10045 +                       free_pages(vstart, order);
10046 +                       return NULL;
10047 +               }
10048 +               memset(ret, 0, size);
10049 +               *dma_handle = virt_to_bus(ret);
10050 +       }
10051 +       return ret;
10052 +}
10053 +EXPORT_SYMBOL(dma_alloc_coherent);
10054 +
10055 +void dma_free_coherent(struct device *dev, size_t size,
10056 +                        void *vaddr, dma_addr_t dma_handle)
10057 +{
10058 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10059 +       int order = get_order(size);
10060 +       
10061 +       if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10062 +               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10063 +
10064 +               bitmap_release_region(mem->bitmap, page, order);
10065 +       } else {
10066 +               xen_destroy_contiguous_region((unsigned long)vaddr, order);
10067 +               free_pages((unsigned long)vaddr, order);
10068 +       }
10069 +}
10070 +EXPORT_SYMBOL(dma_free_coherent);
10071 +
10072 +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
10073 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10074 +                               dma_addr_t device_addr, size_t size, int flags)
10075 +{
10076 +       void __iomem *mem_base;
10077 +       int pages = size >> PAGE_SHIFT;
10078 +       int bitmap_size = (pages + 31)/32;
10079 +
10080 +       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10081 +               goto out;
10082 +       if (!size)
10083 +               goto out;
10084 +       if (dev->dma_mem)
10085 +               goto out;
10086 +
10087 +       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10088 +
10089 +       mem_base = ioremap(bus_addr, size);
10090 +       if (!mem_base)
10091 +               goto out;
10092 +
10093 +       dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10094 +       if (!dev->dma_mem)
10095 +               goto out;
10096 +       memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
10097 +       dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
10098 +       if (!dev->dma_mem->bitmap)
10099 +               goto free1_out;
10100 +       memset(dev->dma_mem->bitmap, 0, bitmap_size);
10101 +
10102 +       dev->dma_mem->virt_base = mem_base;
10103 +       dev->dma_mem->device_base = device_addr;
10104 +       dev->dma_mem->size = pages;
10105 +       dev->dma_mem->flags = flags;
10106 +
10107 +       if (flags & DMA_MEMORY_MAP)
10108 +               return DMA_MEMORY_MAP;
10109 +
10110 +       return DMA_MEMORY_IO;
10111 +
10112 + free1_out:
10113 +       kfree(dev->dma_mem->bitmap);
10114 + out:
10115 +       return 0;
10116 +}
10117 +EXPORT_SYMBOL(dma_declare_coherent_memory);
10118 +
10119 +void dma_release_declared_memory(struct device *dev)
10120 +{
10121 +       struct dma_coherent_mem *mem = dev->dma_mem;
10122 +       
10123 +       if(!mem)
10124 +               return;
10125 +       dev->dma_mem = NULL;
10126 +       iounmap(mem->virt_base);
10127 +       kfree(mem->bitmap);
10128 +       kfree(mem);
10129 +}
10130 +EXPORT_SYMBOL(dma_release_declared_memory);
10131 +
10132 +void *dma_mark_declared_memory_occupied(struct device *dev,
10133 +                                       dma_addr_t device_addr, size_t size)
10134 +{
10135 +       struct dma_coherent_mem *mem = dev->dma_mem;
10136 +       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
10137 +       int pos, err;
10138 +
10139 +       if (!mem)
10140 +               return ERR_PTR(-EINVAL);
10141 +
10142 +       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10143 +       err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10144 +       if (err != 0)
10145 +               return ERR_PTR(err);
10146 +       return mem->virt_base + (pos << PAGE_SHIFT);
10147 +}
10148 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10149 +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
10150 +
10151 +dma_addr_t
10152 +dma_map_single(struct device *dev, void *ptr, size_t size,
10153 +              enum dma_data_direction direction)
10154 +{
10155 +       dma_addr_t dma;
10156 +
10157 +       BUG_ON(!valid_dma_direction(direction));
10158 +       WARN_ON(size == 0);
10159 +
10160 +       if (swiotlb) {
10161 +               dma = swiotlb_map_single(dev, ptr, size, direction);
10162 +       } else {
10163 +               dma = virt_to_bus(ptr);
10164 +               IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
10165 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma));
10166 +       }
10167 +
10168 +       flush_write_buffers();
10169 +       return dma;
10170 +}
10171 +EXPORT_SYMBOL(dma_map_single);
10172 +
10173 +void
10174 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
10175 +                enum dma_data_direction direction)
10176 +{
10177 +       BUG_ON(!valid_dma_direction(direction));
10178 +       if (swiotlb)
10179 +               swiotlb_unmap_single(dev, dma_addr, size, direction);
10180 +}
10181 +EXPORT_SYMBOL(dma_unmap_single);
10182 +
10183 +void
10184 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
10185 +                       enum dma_data_direction direction)
10186 +{
10187 +       if (swiotlb)
10188 +               swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
10189 +}
10190 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
10191 +
10192 +void
10193 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
10194 +                           enum dma_data_direction direction)
10195 +{
10196 +       if (swiotlb)
10197 +               swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
10198 +}
10199 +EXPORT_SYMBOL(dma_sync_single_for_device);
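(For context: a sketch of how a driver would typically consume the coherent
DMA interface implemented above.  The function name, device pointer and buffer
size are placeholders, not taken from this patch; the usual
<linux/dma-mapping.h> include is assumed.)

	/* hypothetical example: allocate, use and free one coherent buffer */
	static int example_alloc_ring(struct device *dev)
	{
		void *buf;
		dma_addr_t handle;

		buf = dma_alloc_coherent(dev, 4096, &handle, GFP_KERNEL);
		if (buf == NULL)
			return -ENOMEM;
		/* program 'handle' into the device; the CPU accesses 'buf' directly */
		dma_free_coherent(dev, 4096, buf, handle);
		return 0;
	}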
10200 diff -ruNp linux-2.6.19/arch/i386/kernel/process-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/process-xen.c
10201 --- linux-2.6.19/arch/i386/kernel/process-xen.c 1970-01-01 00:00:00.000000000 +0000
10202 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/process-xen.c       2007-02-02 19:10:21.000000000 +0000
10203 @@ -0,0 +1,852 @@
10204 +/*
10205 + *  linux/arch/i386/kernel/process.c
10206 + *
10207 + *  Copyright (C) 1995  Linus Torvalds
10208 + *
10209 + *  Pentium III FXSR, SSE support
10210 + *     Gareth Hughes <gareth@valinux.com>, May 2000
10211 + */
10212 +
10213 +/*
10214 + * This file handles the architecture-dependent parts of process handling..
10215 + */
10216 +
10217 +#include <stdarg.h>
10218 +
10219 +#include <linux/cpu.h>
10220 +#include <linux/errno.h>
10221 +#include <linux/sched.h>
10222 +#include <linux/fs.h>
10223 +#include <linux/kernel.h>
10224 +#include <linux/mm.h>
10225 +#include <linux/elfcore.h>
10226 +#include <linux/smp.h>
10227 +#include <linux/smp_lock.h>
10228 +#include <linux/stddef.h>
10229 +#include <linux/slab.h>
10230 +#include <linux/vmalloc.h>
10231 +#include <linux/user.h>
10232 +#include <linux/a.out.h>
10233 +#include <linux/interrupt.h>
10234 +#include <linux/utsname.h>
10235 +#include <linux/delay.h>
10236 +#include <linux/reboot.h>
10237 +#include <linux/init.h>
10238 +#include <linux/mc146818rtc.h>
10239 +#include <linux/module.h>
10240 +#include <linux/kallsyms.h>
10241 +#include <linux/ptrace.h>
10242 +#include <linux/random.h>
10243 +#include <linux/personality.h>
10244 +
10245 +#include <asm/uaccess.h>
10246 +#include <asm/pgtable.h>
10247 +#include <asm/system.h>
10248 +#include <asm/io.h>
10249 +#include <asm/ldt.h>
10250 +#include <asm/processor.h>
10251 +#include <asm/i387.h>
10252 +#include <asm/desc.h>
10253 +#include <asm/vm86.h>
10254 +#ifdef CONFIG_MATH_EMULATION
10255 +#include <asm/math_emu.h>
10256 +#endif
10257 +
10258 +#include <xen/interface/physdev.h>
10259 +#include <xen/interface/vcpu.h>
10260 +#include <xen/cpu_hotplug.h>
10261 +
10262 +#include <linux/err.h>
10263 +
10264 +#include <asm/tlbflush.h>
10265 +#include <asm/cpu.h>
10266 +
10267 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10268 +
10269 +static int hlt_counter;
10270 +
10271 +unsigned long boot_option_idle_override = 0;
10272 +EXPORT_SYMBOL(boot_option_idle_override);
10273 +
10274 +/*
10275 + * Return saved PC of a blocked thread.
10276 + */
10277 +unsigned long thread_saved_pc(struct task_struct *tsk)
10278 +{
10279 +       return ((unsigned long *)tsk->thread.esp)[3];
10280 +}
10281 +
10282 +/*
10283 + * Power-management idle function, if any.
10284 + */
10285 +void (*pm_idle)(void);
10286 +EXPORT_SYMBOL(pm_idle);
10287 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
10288 +
10289 +void disable_hlt(void)
10290 +{
10291 +       hlt_counter++;
10292 +}
10293 +
10294 +EXPORT_SYMBOL(disable_hlt);
10295 +
10296 +void enable_hlt(void)
10297 +{
10298 +       hlt_counter--;
10299 +}
10300 +
10301 +EXPORT_SYMBOL(enable_hlt);
10302 +
10303 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
10304 +void xen_idle(void)
10305 +{
10306 +       local_irq_disable();
10307 +
10308 +       if (need_resched())
10309 +               local_irq_enable();
10310 +       else {
10311 +               current_thread_info()->status &= ~TS_POLLING;
10312 +               /*
10313 +                * TS_POLLING-cleared state must be visible before we
10314 +                * test NEED_RESCHED:
10315 +                */
10316 +               smp_mb();
10317 +
10318 +               safe_halt();
10319 +               current_thread_info()->status |= TS_POLLING;
10320 +       }
10321 +}
10322 +#ifdef CONFIG_APM_MODULE
10323 +EXPORT_SYMBOL(default_idle);
10324 +#endif
10325 +
10326 +#ifdef CONFIG_HOTPLUG_CPU
10327 +extern cpumask_t cpu_initialized;
10328 +static inline void play_dead(void)
10329 +{
10330 +       idle_task_exit();
10331 +       local_irq_disable();
10332 +       cpu_clear(smp_processor_id(), cpu_initialized);
10333 +       preempt_enable_no_resched();
10334 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
10335 +       cpu_bringup();
10336 +}
10337 +#else
10338 +static inline void play_dead(void)
10339 +{
10340 +       BUG();
10341 +}
10342 +#endif /* CONFIG_HOTPLUG_CPU */
10343 +
10344 +/*
10345 + * The idle thread. There's no useful work to be
10346 + * done, so just try to conserve power and have a
10347 + * low exit latency (ie sit in a loop waiting for
10348 + * somebody to say that they'd like to reschedule)
10349 + */
10350 +void cpu_idle(void)
10351 +{
10352 +       int cpu = smp_processor_id();
10353 +
10354 +       current_thread_info()->status |= TS_POLLING;
10355 +
10356 +       /* endless idle loop with no priority at all */
10357 +       while (1) {
10358 +               while (!need_resched()) {
10359 +
10360 +                       if (__get_cpu_var(cpu_idle_state))
10361 +                               __get_cpu_var(cpu_idle_state) = 0;
10362 +
10363 +                       rmb();
10364 +
10365 +                       if (cpu_is_offline(cpu))
10366 +                               play_dead();
10367 +
10368 +                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10369 +                       xen_idle();
10370 +               }
10371 +               preempt_enable_no_resched();
10372 +               schedule();
10373 +               preempt_disable();
10374 +       }
10375 +}
10376 +
10377 +void cpu_idle_wait(void)
10378 +{
10379 +       unsigned int cpu, this_cpu = get_cpu();
10380 +       cpumask_t map, tmp = current->cpus_allowed;
10381 +
10382 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
10383 +       put_cpu();
10384 +
10385 +       cpus_clear(map);
10386 +       for_each_online_cpu(cpu) {
10387 +               per_cpu(cpu_idle_state, cpu) = 1;
10388 +               cpu_set(cpu, map);
10389 +       }
10390 +
10391 +       __get_cpu_var(cpu_idle_state) = 0;
10392 +
10393 +       wmb();
10394 +       do {
10395 +               ssleep(1);
10396 +               for_each_online_cpu(cpu) {
10397 +                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
10398 +                               cpu_clear(cpu, map);
10399 +               }
10400 +               cpus_and(map, map, cpu_online_map);
10401 +       } while (!cpus_empty(map));
10402 +
10403 +       set_cpus_allowed(current, tmp);
10404 +}
10405 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
10406 +
10407 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
10408 +/* Always use xen_idle() instead. */
10409 +void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) {}
10410 +
10411 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) {}
10412 +
10413 +void show_regs(struct pt_regs * regs)
10414 +{
10415 +       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
10416 +
10417 +       printk("\n");
10418 +       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
10419 +       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
10420 +       print_symbol("EIP is at %s\n", regs->eip);
10421 +
10422 +       if (user_mode_vm(regs))
10423 +               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
10424 +       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
10425 +              regs->eflags, print_tainted(), init_utsname()->release,
10426 +              (int)strcspn(init_utsname()->version, " "),
10427 +              init_utsname()->version);
10428 +       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
10429 +               regs->eax,regs->ebx,regs->ecx,regs->edx);
10430 +       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
10431 +               regs->esi, regs->edi, regs->ebp);
10432 +       printk(" DS: %04x ES: %04x\n",
10433 +               0xffff & regs->xds,0xffff & regs->xes);
10434 +
10435 +       cr0 = read_cr0();
10436 +       cr2 = read_cr2();
10437 +       cr3 = read_cr3();
10438 +       cr4 = read_cr4_safe();
10439 +       printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
10440 +       show_trace(NULL, regs, &regs->esp);
10441 +}
10442 +
10443 +/*
10444 + * This gets run with %ebx containing the
10445 + * function to call, and %edx containing
10446 + * the "args".
10447 + */
10448 +extern void kernel_thread_helper(void);
10449 +
10450 +/*
10451 + * Create a kernel thread
10452 + */
10453 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
10454 +{
10455 +       struct pt_regs regs;
10456 +
10457 +       memset(&regs, 0, sizeof(regs));
10458 +
10459 +       regs.ebx = (unsigned long) fn;
10460 +       regs.edx = (unsigned long) arg;
10461 +
10462 +       regs.xds = __USER_DS;
10463 +       regs.xes = __USER_DS;
10464 +       regs.orig_eax = -1;
10465 +       regs.eip = (unsigned long) kernel_thread_helper;
10466 +       regs.xcs = __KERNEL_CS | get_kernel_rpl();
10467 +       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
10468 +
10469 +       /* Ok, create the new process.. */
10470 +       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
10471 +}
10472 +EXPORT_SYMBOL(kernel_thread);
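(For context: a sketch of how kernel_thread() above is invoked; the thread
function and clone flags are illustrative, not taken from this patch.)

	/* hypothetical example: spawn a kernel thread running my_thread_fn */
	static int my_thread_fn(void *data)
	{
		/* ... do work, then exit ... */
		return 0;
	}

	static void example_spawn(void)
	{
		kernel_thread(my_thread_fn, NULL, CLONE_FS | CLONE_FILES);
	}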
10473 +
10474 +/*
10475 + * Free current thread data structures etc..
10476 + */
10477 +void exit_thread(void)
10478 +{
10479 +       /* The process may have allocated an io port bitmap... nuke it. */
10480 +       if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
10481 +               struct task_struct *tsk = current;
10482 +               struct thread_struct *t = &tsk->thread;
10483 +
10484 +               struct physdev_set_iobitmap set_iobitmap = { 0 };
10485 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
10486 +               kfree(t->io_bitmap_ptr);
10487 +               t->io_bitmap_ptr = NULL;
10488 +               clear_thread_flag(TIF_IO_BITMAP);
10489 +       }
10490 +}
10491 +
10492 +void flush_thread(void)
10493 +{
10494 +       struct task_struct *tsk = current;
10495 +
10496 +       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
10497 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
10498 +       clear_tsk_thread_flag(tsk, TIF_DEBUG);
10499 +       /*
10500 +        * Forget coprocessor state..
10501 +        */
10502 +       clear_fpu(tsk);
10503 +       clear_used_math();
10504 +}
10505 +
10506 +void release_thread(struct task_struct *dead_task)
10507 +{
10508 +       BUG_ON(dead_task->mm);
10509 +       release_vm86_irqs(dead_task);
10510 +}
10511 +
10512 +/*
10513 + * This gets called before we allocate a new thread and copy
10514 + * the current task into it.
10515 + */
10516 +void prepare_to_copy(struct task_struct *tsk)
10517 +{
10518 +       unlazy_fpu(tsk);
10519 +}
10520 +
10521 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
10522 +       unsigned long unused,
10523 +       struct task_struct * p, struct pt_regs * regs)
10524 +{
10525 +       struct pt_regs * childregs;
10526 +       struct task_struct *tsk;
10527 +       int err;
10528 +
10529 +       childregs = task_pt_regs(p);
10530 +       *childregs = *regs;
10531 +       childregs->eax = 0;
10532 +       childregs->esp = esp;
10533 +
10534 +       p->thread.esp = (unsigned long) childregs;
10535 +       p->thread.esp0 = (unsigned long) (childregs+1);
10536 +
10537 +       p->thread.eip = (unsigned long) ret_from_fork;
10538 +
10539 +       savesegment(fs,p->thread.fs);
10540 +       savesegment(gs,p->thread.gs);
10541 +
10542 +       tsk = current;
10543 +       if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
10544 +               p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
10545 +                                               IO_BITMAP_BYTES, GFP_KERNEL);
10546 +               if (!p->thread.io_bitmap_ptr) {
10547 +                       p->thread.io_bitmap_max = 0;
10548 +                       return -ENOMEM;
10549 +               }
10550 +               set_tsk_thread_flag(p, TIF_IO_BITMAP);
10551 +       }
10552 +
10553 +       /*
10554 +        * Set a new TLS for the child thread?
10555 +        */
10556 +       if (clone_flags & CLONE_SETTLS) {
10557 +               struct desc_struct *desc;
10558 +               struct user_desc info;
10559 +               int idx;
10560 +
10561 +               err = -EFAULT;
10562 +               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
10563 +                       goto out;
10564 +               err = -EINVAL;
10565 +               if (LDT_empty(&info))
10566 +                       goto out;
10567 +
10568 +               idx = info.entry_number;
10569 +               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10570 +                       goto out;
10571 +
10572 +               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
10573 +               desc->a = LDT_entry_a(&info);
10574 +               desc->b = LDT_entry_b(&info);
10575 +       }
10576 +
10577 +       p->thread.iopl = current->thread.iopl;
10578 +
10579 +       err = 0;
10580 + out:
10581 +       if (err && p->thread.io_bitmap_ptr) {
10582 +               kfree(p->thread.io_bitmap_ptr);
10583 +               p->thread.io_bitmap_max = 0;
10584 +       }
10585 +       return err;
10586 +}
10587 +
10588 +/*
10589 + * fill in the user structure for a core dump..
10590 + */
10591 +void dump_thread(struct pt_regs * regs, struct user * dump)
10592 +{
10593 +       int i;
10594 +
10595 +/* changed the size calculations - should hopefully work better. lbt */
10596 +       dump->magic = CMAGIC;
10597 +       dump->start_code = 0;
10598 +       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
10599 +       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
10600 +       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
10601 +       dump->u_dsize -= dump->u_tsize;
10602 +       dump->u_ssize = 0;
10603 +       for (i = 0; i < 8; i++)
10604 +               dump->u_debugreg[i] = current->thread.debugreg[i];  
10605 +
10606 +       if (dump->start_stack < TASK_SIZE)
10607 +               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
10608 +
10609 +       dump->regs.ebx = regs->ebx;
10610 +       dump->regs.ecx = regs->ecx;
10611 +       dump->regs.edx = regs->edx;
10612 +       dump->regs.esi = regs->esi;
10613 +       dump->regs.edi = regs->edi;
10614 +       dump->regs.ebp = regs->ebp;
10615 +       dump->regs.eax = regs->eax;
10616 +       dump->regs.ds = regs->xds;
10617 +       dump->regs.es = regs->xes;
10618 +       savesegment(fs,dump->regs.fs);
10619 +       savesegment(gs,dump->regs.gs);
10620 +       dump->regs.orig_eax = regs->orig_eax;
10621 +       dump->regs.eip = regs->eip;
10622 +       dump->regs.cs = regs->xcs;
10623 +       dump->regs.eflags = regs->eflags;
10624 +       dump->regs.esp = regs->esp;
10625 +       dump->regs.ss = regs->xss;
10626 +
10627 +       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
10628 +}
10629 +EXPORT_SYMBOL(dump_thread);
10630 +
10631 +/* 
10632 + * Capture the user space registers if the task is not running (in user space)
10633 + */
10634 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
10635 +{
10636 +       struct pt_regs ptregs = *task_pt_regs(tsk);
10637 +       ptregs.xcs &= 0xffff;
10638 +       ptregs.xds &= 0xffff;
10639 +       ptregs.xes &= 0xffff;
10640 +       ptregs.xss &= 0xffff;
10641 +
10642 +       elf_core_copy_regs(regs, &ptregs);
10643 +
10644 +       return 1;
10645 +}
10646 +
10647 +static noinline void __switch_to_xtra(struct task_struct *next_p)
10648 +{
10649 +       struct thread_struct *next;
10650 +
10651 +       next = &next_p->thread;
10652 +
10653 +       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
10654 +               set_debugreg(next->debugreg[0], 0);
10655 +               set_debugreg(next->debugreg[1], 1);
10656 +               set_debugreg(next->debugreg[2], 2);
10657 +               set_debugreg(next->debugreg[3], 3);
10658 +               /* no 4 and 5 */
10659 +               set_debugreg(next->debugreg[6], 6);
10660 +               set_debugreg(next->debugreg[7], 7);
10661 +       }
10662 +#ifndef CONFIG_XEN
10663 +       if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
10664 +               /*
10665 +                * Disable the bitmap via an invalid offset. We still cache
10666 +                * the previous bitmap owner and the IO bitmap contents:
10667 +                */
10668 +               tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
10669 +               return;
10670 +       }
10671 +
10672 +       if (likely(next == tss->io_bitmap_owner)) {
10673 +               /*
10674 +                * Previous owner of the bitmap (hence the bitmap content)
10675 +                * matches the next task, we don't have to do anything but
10676 +                * to set a valid offset in the TSS:
10677 +                */
10678 +               tss->io_bitmap_base = IO_BITMAP_OFFSET;
10679 +               return;
10680 +       }
10681 +       /*
10682 +        * Lazy TSS's I/O bitmap copy. We set an invalid offset here
10683 +        * and we let the task get a GPF if an I/O instruction
10684 +        * is performed.  The GPF handler will verify that the
10685 +        * faulting task has a valid I/O bitmap and, if so, do the
10686 +        * real copy and restart the instruction.  This will save us
10687 +        * redundant copies when the currently switched task does not
10688 +        * perform any I/O during its timeslice.
10689 +        */
10690 +       tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
10691 +#endif /* !CONFIG_XEN */
10692 +}
10693 +
10694 +/*
10695 + * This function selects if the context switch from prev to next
10696 + * has to tweak the TSC disable bit in the cr4.
10697 + */
10698 +static inline void disable_tsc(struct task_struct *prev_p,
10699 +                              struct task_struct *next_p)
10700 +{
10701 +       struct thread_info *prev, *next;
10702 +
10703 +       /*
10704 +        * gcc should eliminate the ->thread_info dereference if
10705 +        * has_secure_computing returns 0 at compile time (SECCOMP=n).
10706 +        */
10707 +       prev = task_thread_info(prev_p);
10708 +       next = task_thread_info(next_p);
10709 +
10710 +       if (has_secure_computing(prev) || has_secure_computing(next)) {
10711 +               /* slow path here */
10712 +               if (has_secure_computing(prev) &&
10713 +                   !has_secure_computing(next)) {
10714 +                       write_cr4(read_cr4() & ~X86_CR4_TSD);
10715 +               } else if (!has_secure_computing(prev) &&
10716 +                          has_secure_computing(next))
10717 +                       write_cr4(read_cr4() | X86_CR4_TSD);
10718 +       }
10719 +}
10720 +
10721 +/*
10722 + *     switch_to(x,y) should switch tasks from x to y.
10723 + *
10724 + * We fsave/fwait so that an exception goes off at the right time
10725 + * (as a call from the fsave or fwait in effect) rather than to
10726 + * the wrong process. Lazy FP saving no longer makes any sense
10727 + * with modern CPUs, and this simplifies a lot of things (SMP
10728 + * and UP become the same).
10729 + *
10730 + * NOTE! We used to use the x86 hardware context switching. The
10731 + * reason for not using it any more becomes apparent when you
10732 + * try to recover gracefully from saved state that is no longer
10733 + * valid (stale segment register values in particular). With the
10734 + * hardware task-switch, there is no way to fix up bad state in
10735 + * a reasonable manner.
10736 + *
10737 + * The fact that Intel documents the hardware task-switching to
10738 + * be slow is a fairly red herring - this code is not noticeably
10739 + * faster. However, there _is_ some room for improvement here,
10740 + * so the performance issues may eventually be a valid point.
10741 + * More important, however, is the fact that this allows us much
10742 + * more flexibility.
10743 + *
10744 + * The return value (in %eax) will be the "prev" task after
10745 + * the task-switch, and shows up in ret_from_fork in entry.S,
10746 + * for example.
10747 + */
10748 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
10749 +{
10750 +       struct thread_struct *prev = &prev_p->thread,
10751 +                                *next = &next_p->thread;
10752 +       int cpu = smp_processor_id();
10753 +#ifndef CONFIG_X86_NO_TSS
10754 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
10755 +#endif
10756 +       struct physdev_set_iopl iopl_op;
10757 +       struct physdev_set_iobitmap iobmp_op;
10758 +       multicall_entry_t _mcl[8], *mcl = _mcl;
10759 +
10760 +       /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
10761 +
10762 +       /*
10763 +        * This is basically '__unlazy_fpu', except that we queue a
10764 +        * multicall to indicate FPU task switch, rather than
10765 +        * synchronously trapping to Xen.
10766 +        */
10767 +       if (prev_p->thread_info->status & TS_USEDFPU) {
10768 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
10769 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
10770 +               mcl->args[0] = 1;
10771 +               mcl++;
10772 +       }
10773 +#if 0 /* lazy fpu sanity check */
10774 +       else BUG_ON(!(read_cr0() & 8));
10775 +#endif
10776 +
10777 +       /*
10778 +        * Reload esp0.
10779 +        * This is load_esp0(tss, next) with a multicall.
10780 +        */
10781 +       mcl->op      = __HYPERVISOR_stack_switch;
10782 +       mcl->args[0] = __KERNEL_DS;
10783 +       mcl->args[1] = next->esp0;
10784 +       mcl++;
10785 +
10786 +       /*
10787 +        * Load the per-thread Thread-Local Storage descriptor.
10788 +        * This is load_TLS(next, cpu) with multicalls.
10789 +        */
10790 +#define C(i) do {                                                      \
10791 +       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
10792 +                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
10793 +               mcl->op = __HYPERVISOR_update_descriptor;               \
10794 +               *(u64 *)&mcl->args[0] = virt_to_machine(                \
10795 +                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
10796 +               *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];    \
10797 +               mcl++;                                                  \
10798 +       }                                                               \
10799 +} while (0)
10800 +       C(0); C(1); C(2);
10801 +#undef C
10802 +
10803 +       if (unlikely(prev->iopl != next->iopl)) {
10804 +               iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
10805 +               mcl->op      = __HYPERVISOR_physdev_op;
10806 +               mcl->args[0] = PHYSDEVOP_set_iopl;
10807 +               mcl->args[1] = (unsigned long)&iopl_op;
10808 +               mcl++;
10809 +       }
10810 +
10811 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
10812 +               iobmp_op.bitmap   = (char *)next->io_bitmap_ptr;
10813 +               iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
10814 +               mcl->op      = __HYPERVISOR_physdev_op;
10815 +               mcl->args[0] = PHYSDEVOP_set_iobitmap;
10816 +               mcl->args[1] = (unsigned long)&iobmp_op;
10817 +               mcl++;
10818 +       }
10819 +
10820 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
10821 +
10822 +       /*
10823 +        * Restore %fs and %gs if needed.
10824 +        *
10825 +        * Glibc normally makes %fs be zero, and %gs is one of
10826 +        * the TLS segments.
10827 +        */
10828 +       if (unlikely(next->fs))
10829 +               loadsegment(fs, next->fs);
10830 +
10831 +       if (next->gs)
10832 +               loadsegment(gs, next->gs);
10833 +
10834 +       /*
10835 +        * Now maybe handle debug registers and/or IO bitmaps
10836 +        */
10837 +       if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
10838 +           || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
10839 +               __switch_to_xtra(next_p);
10840 +
10841 +       disable_tsc(prev_p, next_p);
10842 +
10843 +       return prev_p;
10844 +}
10845 +
10846 +asmlinkage int sys_fork(struct pt_regs regs)
10847 +{
10848 +       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
10849 +}
10850 +
10851 +asmlinkage int sys_clone(struct pt_regs regs)
10852 +{
10853 +       unsigned long clone_flags;
10854 +       unsigned long newsp;
10855 +       int __user *parent_tidptr, *child_tidptr;
10856 +
10857 +       clone_flags = regs.ebx;
10858 +       newsp = regs.ecx;
10859 +       parent_tidptr = (int __user *)regs.edx;
10860 +       child_tidptr = (int __user *)regs.edi;
10861 +       if (!newsp)
10862 +               newsp = regs.esp;
10863 +       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
10864 +}
10865 +
10866 +/*
10867 + * This is trivial, and on the face of it looks like it
10868 + * could equally well be done in user mode.
10869 + *
10870 + * Not so, for quite unobvious reasons - register pressure.
10871 + * In user mode vfork() cannot have a stack frame, and if
10872 + * done by calling the "clone()" system call directly, you
10873 + * do not have enough call-clobbered registers to hold all
10874 + * the information you need.
10875 + */
10876 +asmlinkage int sys_vfork(struct pt_regs regs)
10877 +{
10878 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
10879 +}
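Editor's note: as the comment above explains, the child of vfork() borrows the parent's stack until it calls execve() or _exit(). A userspace sketch of the only safe usage pattern (illustrative, not part of the patch):

    #include <unistd.h>

    pid_t spawn(const char *path, char *const argv[], char *const envp[])
    {
            pid_t pid = vfork();

            if (pid == 0) {            /* child: may only execve() or _exit() */
                    execve(path, argv, envp);
                    _exit(127);        /* reached only if execve() failed */
            }
            return pid;                /* parent resumes once the child execs */
    }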
10880 +
10881 +/*
10882 + * sys_execve() executes a new program.
10883 + */
10884 +asmlinkage int sys_execve(struct pt_regs regs)
10885 +{
10886 +       int error;
10887 +       char * filename;
10888 +
10889 +       filename = getname((char __user *) regs.ebx);
10890 +       error = PTR_ERR(filename);
10891 +       if (IS_ERR(filename))
10892 +               goto out;
10893 +       error = do_execve(filename,
10894 +                       (char __user * __user *) regs.ecx,
10895 +                       (char __user * __user *) regs.edx,
10896 +                       &regs);
10897 +       if (error == 0) {
10898 +               task_lock(current);
10899 +               current->ptrace &= ~PT_DTRACE;
10900 +               task_unlock(current);
10901 +               /* Make sure we don't return using sysenter.. */
10902 +               set_thread_flag(TIF_IRET);
10903 +       }
10904 +       putname(filename);
10905 +out:
10906 +       return error;
10907 +}
10908 +
10909 +#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
10910 +#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
10911 +
10912 +unsigned long get_wchan(struct task_struct *p)
10913 +{
10914 +       unsigned long ebp, esp, eip;
10915 +       unsigned long stack_page;
10916 +       int count = 0;
10917 +       if (!p || p == current || p->state == TASK_RUNNING)
10918 +               return 0;
10919 +       stack_page = (unsigned long)task_stack_page(p);
10920 +       esp = p->thread.esp;
10921 +       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
10922 +               return 0;
10923 +       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
10924 +       ebp = *(unsigned long *) esp;
10925 +       do {
10926 +               if (ebp < stack_page || ebp > top_ebp+stack_page)
10927 +                       return 0;
10928 +               eip = *(unsigned long *) (ebp+4);
10929 +               if (!in_sched_functions(eip))
10930 +                       return eip;
10931 +               ebp = *(unsigned long *) ebp;
10932 +       } while (count++ < 16);
10933 +       return 0;
10934 +}
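Editor's note: get_wchan() relies on the standard i386 frame layout left by switch_to(): the sleeping task's saved %ebp sits at the top of its stack, and each frame stores the caller's %ebp at offset 0 and the return %eip at offset 4. Schematically (illustrative comment only):

    /*
     *   thread.esp --> [ saved ebp ]          <- first frame pointer
     *   ebp        --> [ caller's ebp ]       <- next frame to walk
     *                  [ return eip   ]       <- candidate wchan, skipped while
     *                                            in_sched_functions(eip) is true
     */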
10935 +
10936 +/*
10937 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
10938 + */
10939 +static int get_free_idx(void)
10940 +{
10941 +       struct thread_struct *t = &current->thread;
10942 +       int idx;
10943 +
10944 +       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
10945 +               if (desc_empty(t->tls_array + idx))
10946 +                       return idx + GDT_ENTRY_TLS_MIN;
10947 +       return -ESRCH;
10948 +}
10949 +
10950 +/*
10951 + * Set a given TLS descriptor:
10952 + */
10953 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
10954 +{
10955 +       struct thread_struct *t = &current->thread;
10956 +       struct user_desc info;
10957 +       struct desc_struct *desc;
10958 +       int cpu, idx;
10959 +
10960 +       if (copy_from_user(&info, u_info, sizeof(info)))
10961 +               return -EFAULT;
10962 +       idx = info.entry_number;
10963 +
10964 +       /*
10965 +        * index -1 means the kernel should try to find and
10966 +        * allocate an empty descriptor:
10967 +        */
10968 +       if (idx == -1) {
10969 +               idx = get_free_idx();
10970 +               if (idx < 0)
10971 +                       return idx;
10972 +               if (put_user(idx, &u_info->entry_number))
10973 +                       return -EFAULT;
10974 +       }
10975 +
10976 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10977 +               return -EINVAL;
10978 +
10979 +       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
10980 +
10981 +       /*
10982 +        * We must not get preempted while modifying the TLS.
10983 +        */
10984 +       cpu = get_cpu();
10985 +
10986 +       if (LDT_empty(&info)) {
10987 +               desc->a = 0;
10988 +               desc->b = 0;
10989 +       } else {
10990 +               desc->a = LDT_entry_a(&info);
10991 +               desc->b = LDT_entry_b(&info);
10992 +       }
10993 +       load_TLS(t, cpu);
10994 +
10995 +       put_cpu();
10996 +
10997 +       return 0;
10998 +}
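Editor's note: a userspace sketch of how sys_set_thread_area() is exercised (the helper name is hypothetical and not part of the patch). Passing entry_number = -1 asks the kernel to pick a free GDT TLS slot and write the chosen index back into the user_desc.

    #include <asm/ldt.h>           /* struct user_desc */
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int install_tls(unsigned long base, unsigned int limit_in_pages)
    {
            struct user_desc info;

            memset(&info, 0, sizeof(info));
            info.entry_number   = -1;      /* let the kernel find a free slot */
            info.base_addr      = base;
            info.limit          = limit_in_pages;
            info.seg_32bit      = 1;
            info.limit_in_pages = 1;
            info.useable        = 1;

            if (syscall(SYS_set_thread_area, &info) != 0)
                    return -1;
            return info.entry_number;      /* index chosen by the kernel */
    }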
10999 +
11000 +/*
11001 + * Get the current Thread-Local Storage area:
11002 + */
11003 +
11004 +#define GET_BASE(desc) ( \
11005 +       (((desc)->a >> 16) & 0x0000ffff) | \
11006 +       (((desc)->b << 16) & 0x00ff0000) | \
11007 +       ( (desc)->b        & 0xff000000)   )
11008 +
11009 +#define GET_LIMIT(desc) ( \
11010 +       ((desc)->a & 0x0ffff) | \
11011 +        ((desc)->b & 0xf0000) )
11012 +       
11013 +#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
11014 +#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
11015 +#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
11016 +#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
11017 +#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
11018 +#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
11019 +
11020 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
11021 +{
11022 +       struct user_desc info;
11023 +       struct desc_struct *desc;
11024 +       int idx;
11025 +
11026 +       if (get_user(idx, &u_info->entry_number))
11027 +               return -EFAULT;
11028 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
11029 +               return -EINVAL;
11030 +
11031 +       memset(&info, 0, sizeof(info));
11032 +
11033 +       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
11034 +
11035 +       info.entry_number = idx;
11036 +       info.base_addr = GET_BASE(desc);
11037 +       info.limit = GET_LIMIT(desc);
11038 +       info.seg_32bit = GET_32BIT(desc);
11039 +       info.contents = GET_CONTENTS(desc);
11040 +       info.read_exec_only = !GET_WRITABLE(desc);
11041 +       info.limit_in_pages = GET_LIMIT_PAGES(desc);
11042 +       info.seg_not_present = !GET_PRESENT(desc);
11043 +       info.useable = GET_USEABLE(desc);
11044 +
11045 +       if (copy_to_user(u_info, &info, sizeof(info)))
11046 +               return -EFAULT;
11047 +       return 0;
11048 +}
11049 +
11050 +unsigned long arch_align_stack(unsigned long sp)
11051 +{
11052 +       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
11053 +               sp -= get_random_int() % 8192;
11054 +       return sp & ~0xf;
11055 +}
11056 diff -ruNp linux-2.6.19/arch/i386/kernel/quirks-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/quirks-xen.c
11057 --- linux-2.6.19/arch/i386/kernel/quirks-xen.c  1970-01-01 00:00:00.000000000 +0000
11058 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/quirks-xen.c        2007-02-02 19:10:21.000000000 +0000
11059 @@ -0,0 +1,47 @@
11060 +/*
11061 + * This file contains work-arounds for x86 and x86_64 platform bugs.
11062 + */
11063 +#include <linux/pci.h>
11064 +#include <linux/irq.h>
11065 +
11066 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
11067 +
11068 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
11069 +{
11070 +       u8 config, rev;
11071 +       u32 word;
11072 +
11073 +       /* BIOS may enable hardware IRQ balancing for
11074 +        * E7520/E7320/E7525(revision ID 0x9 and below)
11075 +        * based platforms.
11076 +        * Disable SW irqbalance/affinity on those platforms.
11077 +        */
11078 +       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
11079 +       if (rev > 0x9)
11080 +               return;
11081 +
11082 +       printk(KERN_INFO "Intel E7520/7320/7525 detected.");
11083 +
11084 +       /* enable access to config space*/
11085 +       pci_read_config_byte(dev, 0xf4, &config);
11086 +       pci_write_config_byte(dev, 0xf4, config|0x2);
11087 +
11088 +       /* read xTPR register */
11089 +       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
11090 +
11091 +       if (!(word & (1 << 13))) {
11092 +               dom0_op_t op;
11093 +               printk(KERN_INFO "Disabling irq balancing and affinity\n");
11094 +               op.cmd = DOM0_PLATFORM_QUIRK;
11095 +               op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
11096 +               (void)HYPERVISOR_dom0_op(&op);
11097 +       }
11098 +
11099 +       /* put back the original value for config space*/
11100 +       if (!(config & 0x2))
11101 +               pci_write_config_byte(dev, 0xf4, config);
11102 +}
11103 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
11104 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
11105 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
11106 +#endif
11107 diff -ruNp linux-2.6.19/arch/i386/kernel/setup-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/setup-xen.c
11108 --- linux-2.6.19/arch/i386/kernel/setup-xen.c   1970-01-01 00:00:00.000000000 +0000
11109 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/setup-xen.c 2007-02-02 19:10:21.000000000 +0000
11110 @@ -0,0 +1,1748 @@
11111 +/*
11112 + *  linux/arch/i386/kernel/setup.c
11113 + *
11114 + *  Copyright (C) 1995  Linus Torvalds
11115 + *
11116 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11117 + *
11118 + *  Memory region support
11119 + *     David Parsons <orc@pell.chi.il.us>, July-August 1999
11120 + *
11121 + *  Added E820 sanitization routine (removes overlapping memory regions);
11122 + *  Brian Moyle <bmoyle@mvista.com>, February 2001
11123 + *
11124 + * Moved CPU detection code to cpu/${cpu}.c
11125 + *    Patrick Mochel <mochel@osdl.org>, March 2002
11126 + *
11127 + *  Provisions for empty E820 memory regions (reported by certain BIOSes).
11128 + *  Alex Achenbach <xela@slit.de>, December 2002.
11129 + *
11130 + */
11131 +
11132 +/*
11133 + * This file handles the architecture-dependent parts of initialization
11134 + */
11135 +
11136 +#include <linux/sched.h>
11137 +#include <linux/mm.h>
11138 +#include <linux/mmzone.h>
11139 +#include <linux/screen_info.h>
11140 +#include <linux/ioport.h>
11141 +#include <linux/acpi.h>
11142 +#include <linux/apm_bios.h>
11143 +#include <linux/initrd.h>
11144 +#include <linux/bootmem.h>
11145 +#include <linux/seq_file.h>
11146 +#include <linux/platform_device.h>
11147 +#include <linux/console.h>
11148 +#include <linux/mca.h>
11149 +#include <linux/root_dev.h>
11150 +#include <linux/highmem.h>
11151 +#include <linux/module.h>
11152 +#include <linux/efi.h>
11153 +#include <linux/init.h>
11154 +#include <linux/edd.h>
11155 +#include <linux/nodemask.h>
11156 +#include <linux/notifier.h>
11157 +#include <linux/kexec.h>
11158 +#include <linux/crash_dump.h>
11159 +#include <linux/dmi.h>
11160 +#include <linux/pfn.h>
11161 +
11162 +#include <video/edid.h>
11163 +
11164 +#include <asm/apic.h>
11165 +#include <asm/e820.h>
11166 +#include <asm/mpspec.h>
11167 +#include <asm/mmzone.h>
11168 +#include <asm/setup.h>
11169 +#include <asm/arch_hooks.h>
11170 +#include <asm/sections.h>
11171 +#include <asm/io_apic.h>
11172 +#include <asm/ist.h>
11173 +#include <asm/io.h>
11174 +#include <asm/hypervisor.h>
11175 +#include <xen/interface/physdev.h>
11176 +#include <xen/interface/memory.h>
11177 +#include <xen/features.h>
11178 +#include <xen/xencons.h>
11179 +#include <setup_arch.h>
11180 +#include <bios_ebda.h>
11181 +
11182 +#ifdef CONFIG_XEN
11183 +#include <xen/interface/kexec.h>
11184 +#endif
11185 +
11186 +/* Forward Declaration. */
11187 +void __init find_max_pfn(void);
11188 +
11189 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11190 +static struct notifier_block xen_panic_block = {
11191 +       xen_panic_event, NULL, 0 /* try to go last */
11192 +};
11193 +
11194 +extern char hypercall_page[PAGE_SIZE];
11195 +EXPORT_SYMBOL(hypercall_page);
11196 +
11197 +int disable_pse __devinitdata = 0;
11198 +
11199 +/*
11200 + * Machine setup..
11201 + */
11202 +
11203 +#ifdef CONFIG_EFI
11204 +int efi_enabled = 0;
11205 +EXPORT_SYMBOL(efi_enabled);
11206 +#endif
11207 +
11208 +/* cpu data as detected by the assembly code in head.S */
11209 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
11210 +/* common cpu data for all cpus */
11211 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
11212 +EXPORT_SYMBOL(boot_cpu_data);
11213 +
11214 +unsigned long mmu_cr4_features;
11215 +
11216 +/* for MCA, but anyone else can use it if they want */
11217 +unsigned int machine_id;
11218 +#ifdef CONFIG_MCA
11219 +EXPORT_SYMBOL(machine_id);
11220 +#endif
11221 +unsigned int machine_submodel_id;
11222 +unsigned int BIOS_revision;
11223 +unsigned int mca_pentium_flag;
11224 +
11225 +/* For PCI or other memory-mapped resources */
11226 +unsigned long pci_mem_start = 0x10000000;
11227 +#ifdef CONFIG_PCI
11228 +EXPORT_SYMBOL(pci_mem_start);
11229 +#endif
11230 +
11231 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11232 +int bootloader_type;
11233 +
11234 +/* user-defined highmem size */
11235 +static unsigned int highmem_pages = -1;
11236 +
11237 +/*
11238 + * Setup options
11239 + */
11240 +struct drive_info_struct { char dummy[32]; } drive_info;
11241 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
11242 +    defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
11243 +EXPORT_SYMBOL(drive_info);
11244 +#endif
11245 +struct screen_info screen_info;
11246 +EXPORT_SYMBOL(screen_info);
11247 +struct apm_info apm_info;
11248 +EXPORT_SYMBOL(apm_info);
11249 +struct sys_desc_table_struct {
11250 +       unsigned short length;
11251 +       unsigned char table[0];
11252 +};
11253 +struct edid_info edid_info;
11254 +EXPORT_SYMBOL_GPL(edid_info);
11255 +struct ist_info ist_info;
11256 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
11257 +       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11258 +EXPORT_SYMBOL(ist_info);
11259 +#endif
11260 +struct e820map e820;
11261 +static void __init e820_setup_gap(struct e820entry *e820, int nr_map);
11262 +void __init add_memory_region(unsigned long long start,
11263 +                             unsigned long long size, int type);
11264 +#ifdef CONFIG_XEN
11265 +struct e820map machine_e820;
11266 +#endif
11267 +
11268 +extern void early_cpu_init(void);
11269 +extern int root_mountflags;
11270 +
11271 +unsigned long saved_videomode;
11272 +
11273 +#define RAMDISK_IMAGE_START_MASK       0x07FF
11274 +#define RAMDISK_PROMPT_FLAG            0x8000
11275 +#define RAMDISK_LOAD_FLAG              0x4000  
11276 +
11277 +static char command_line[COMMAND_LINE_SIZE];
11278 +
11279 +unsigned char __initdata boot_params[PARAM_SIZE];
11280 +
11281 +static struct resource data_resource = {
11282 +       .name   = "Kernel data",
11283 +       .start  = 0,
11284 +       .end    = 0,
11285 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
11286 +};
11287 +
11288 +static struct resource code_resource = {
11289 +       .name   = "Kernel code",
11290 +       .start  = 0,
11291 +       .end    = 0,
11292 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
11293 +};
11294 +
11295 +static struct resource system_rom_resource = {
11296 +       .name   = "System ROM",
11297 +       .start  = 0xf0000,
11298 +       .end    = 0xfffff,
11299 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11300 +};
11301 +
11302 +static struct resource extension_rom_resource = {
11303 +       .name   = "Extension ROM",
11304 +       .start  = 0xe0000,
11305 +       .end    = 0xeffff,
11306 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11307 +};
11308 +
11309 +static struct resource adapter_rom_resources[] = { {
11310 +       .name   = "Adapter ROM",
11311 +       .start  = 0xc8000,
11312 +       .end    = 0,
11313 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11314 +}, {
11315 +       .name   = "Adapter ROM",
11316 +       .start  = 0,
11317 +       .end    = 0,
11318 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11319 +}, {
11320 +       .name   = "Adapter ROM",
11321 +       .start  = 0,
11322 +       .end    = 0,
11323 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11324 +}, {
11325 +       .name   = "Adapter ROM",
11326 +       .start  = 0,
11327 +       .end    = 0,
11328 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11329 +}, {
11330 +       .name   = "Adapter ROM",
11331 +       .start  = 0,
11332 +       .end    = 0,
11333 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11334 +}, {
11335 +       .name   = "Adapter ROM",
11336 +       .start  = 0,
11337 +       .end    = 0,
11338 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11339 +} };
11340 +
11341 +static struct resource video_rom_resource = {
11342 +       .name   = "Video ROM",
11343 +       .start  = 0xc0000,
11344 +       .end    = 0xc7fff,
11345 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11346 +};
11347 +
11348 +static struct resource video_ram_resource = {
11349 +       .name   = "Video RAM area",
11350 +       .start  = 0xa0000,
11351 +       .end    = 0xbffff,
11352 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
11353 +};
11354 +
11355 +static struct resource standard_io_resources[] = { {
11356 +       .name   = "dma1",
11357 +       .start  = 0x0000,
11358 +       .end    = 0x001f,
11359 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11360 +}, {
11361 +       .name   = "pic1",
11362 +       .start  = 0x0020,
11363 +       .end    = 0x0021,
11364 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11365 +}, {
11366 +       .name   = "timer0",
11367 +       .start  = 0x0040,
11368 +       .end    = 0x0043,
11369 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11370 +}, {
11371 +       .name   = "timer1",
11372 +       .start  = 0x0050,
11373 +       .end    = 0x0053,
11374 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11375 +}, {
11376 +       .name   = "keyboard",
11377 +       .start  = 0x0060,
11378 +       .end    = 0x006f,
11379 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11380 +}, {
11381 +       .name   = "dma page reg",
11382 +       .start  = 0x0080,
11383 +       .end    = 0x008f,
11384 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11385 +}, {
11386 +       .name   = "pic2",
11387 +       .start  = 0x00a0,
11388 +       .end    = 0x00a1,
11389 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11390 +}, {
11391 +       .name   = "dma2",
11392 +       .start  = 0x00c0,
11393 +       .end    = 0x00df,
11394 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11395 +}, {
11396 +       .name   = "fpu",
11397 +       .start  = 0x00f0,
11398 +       .end    = 0x00ff,
11399 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
11400 +} };
11401 +
11402 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
11403 +
11404 +static int __init romchecksum(unsigned char *rom, unsigned long length)
11405 +{
11406 +       unsigned char *p, sum = 0;
11407 +
11408 +       for (p = rom; p < rom + length; p++)
11409 +               sum += *p;
11410 +       return sum == 0;
11411 +}
11412 +
11413 +static void __init probe_roms(void)
11414 +{
11415 +       unsigned long start, length, upper;
11416 +       unsigned char *rom;
11417 +       int           i;
11418 +
11419 +#ifdef CONFIG_XEN
11420 +       /* Nothing to do if not running in dom0. */
11421 +       if (!is_initial_xendomain())
11422 +               return;
11423 +#endif
11424 +
11425 +       /* video rom */
11426 +       upper = adapter_rom_resources[0].start;
11427 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
11428 +               rom = isa_bus_to_virt(start);
11429 +               if (!romsignature(rom))
11430 +                       continue;
11431 +
11432 +               video_rom_resource.start = start;
11433 +
11434 +               /* 0 < length <= 0x7f * 512, historically */
11435 +               length = rom[2] * 512;
11436 +
11437 +               /* if checksum okay, trust length byte */
11438 +               if (length && romchecksum(rom, length))
11439 +                       video_rom_resource.end = start + length - 1;
11440 +
11441 +               request_resource(&iomem_resource, &video_rom_resource);
11442 +               break;
11443 +       }
11444 +
11445 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
11446 +       if (start < upper)
11447 +               start = upper;
11448 +
11449 +       /* system rom */
11450 +       request_resource(&iomem_resource, &system_rom_resource);
11451 +       upper = system_rom_resource.start;
11452 +
11453 +       /* check for extension rom (ignore length byte!) */
11454 +       rom = isa_bus_to_virt(extension_rom_resource.start);
11455 +       if (romsignature(rom)) {
11456 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
11457 +               if (romchecksum(rom, length)) {
11458 +                       request_resource(&iomem_resource, &extension_rom_resource);
11459 +                       upper = extension_rom_resource.start;
11460 +               }
11461 +       }
11462 +
11463 +       /* check for adapter roms on 2k boundaries */
11464 +       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
11465 +               rom = isa_bus_to_virt(start);
11466 +               if (!romsignature(rom))
11467 +                       continue;
11468 +
11469 +               /* 0 < length <= 0x7f * 512, historically */
11470 +               length = rom[2] * 512;
11471 +
11472 +               /* but accept any length that fits if checksum okay */
11473 +               if (!length || start + length > upper || !romchecksum(rom, length))
11474 +                       continue;
11475 +
11476 +               adapter_rom_resources[i].start = start;
11477 +               adapter_rom_resources[i].end = start + length - 1;
11478 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
11479 +
11480 +               start = adapter_rom_resources[i++].end & ~2047UL;
11481 +       }
11482 +}
11483 +
11484 +/*
11485 + * Point at the empty zero page to start with. We map the real shared_info
11486 + * page as soon as fixmap is up and running.
11487 + */
11488 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11489 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11490 +
11491 +unsigned long *phys_to_machine_mapping;
11492 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
11493 +EXPORT_SYMBOL(phys_to_machine_mapping);
11494 +
11495 +/* Raw start-of-day parameters from the hypervisor. */
11496 +start_info_t *xen_start_info;
11497 +EXPORT_SYMBOL(xen_start_info);
11498 +
11499 +static void __init limit_regions(unsigned long long size)
11500 +{
11501 +       unsigned long long current_addr = 0;
11502 +       int i;
11503 +
11504 +       if (efi_enabled) {
11505 +               efi_memory_desc_t *md;
11506 +               void *p;
11507 +
11508 +               for (p = memmap.map, i = 0; p < memmap.map_end;
11509 +                       p += memmap.desc_size, i++) {
11510 +                       md = p;
11511 +                       current_addr = md->phys_addr + (md->num_pages << 12);
11512 +                       if (md->type == EFI_CONVENTIONAL_MEMORY) {
11513 +                               if (current_addr >= size) {
11514 +                                       md->num_pages -=
11515 +                                               (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
11516 +                                       memmap.nr_map = i + 1;
11517 +                                       return;
11518 +                               }
11519 +                       }
11520 +               }
11521 +       }
11522 +       for (i = 0; i < e820.nr_map; i++) {
11523 +               current_addr = e820.map[i].addr + e820.map[i].size;
11524 +               if (current_addr < size)
11525 +                       continue;
11526 +
11527 +               if (e820.map[i].type != E820_RAM)
11528 +                       continue;
11529 +
11530 +               if (e820.map[i].addr >= size) {
11531 +                       /*
11532 +                        * This region starts past the end of the
11533 +                        * requested size, skip it completely.
11534 +                        */
11535 +                       e820.nr_map = i;
11536 +               } else {
11537 +                       e820.nr_map = i + 1;
11538 +                       e820.map[i].size -= current_addr - size;
11539 +               }
11540 +               return;
11541 +       }
11542 +#ifdef CONFIG_XEN
11543 +       if (i==e820.nr_map && current_addr < size) {
11544 +               /*
11545 +                 * The e820 map finished before our requested size so
11546 +                 * extend the final entry to the requested address.
11547 +                 */
11548 +               --i;
11549 +               if (e820.map[i].type == E820_RAM)
11550 +                       e820.map[i].size -= current_addr - size;
11551 +               else
11552 +                       add_memory_region(current_addr, size - current_addr, E820_RAM);
11553 +       }
11554 +#endif
11555 +}
11556 +
11557 +void __init add_memory_region(unsigned long long start,
11558 +                             unsigned long long size, int type)
11559 +{
11560 +       int x;
11561 +
11562 +       if (!efi_enabled) {
11563 +               x = e820.nr_map;
11564 +
11565 +               if (x == E820MAX) {
11566 +                   printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
11567 +                   return;
11568 +               }
11569 +
11570 +               e820.map[x].addr = start;
11571 +               e820.map[x].size = size;
11572 +               e820.map[x].type = type;
11573 +               e820.nr_map++;
11574 +       }
11575 +} /* add_memory_region */
11576 +
11577 +#define E820_DEBUG     1
11578 +
11579 +static void __init print_memory_map(char *who)
11580 +{
11581 +       int i;
11582 +
11583 +       for (i = 0; i < e820.nr_map; i++) {
11584 +               printk(" %s: %016Lx - %016Lx ", who,
11585 +                       e820.map[i].addr,
11586 +                       e820.map[i].addr + e820.map[i].size);
11587 +               switch (e820.map[i].type) {
11588 +               case E820_RAM:  printk("(usable)\n");
11589 +                               break;
11590 +               case E820_RESERVED:
11591 +                               printk("(reserved)\n");
11592 +                               break;
11593 +               case E820_ACPI:
11594 +                               printk("(ACPI data)\n");
11595 +                               break;
11596 +               case E820_NVS:
11597 +                               printk("(ACPI NVS)\n");
11598 +                               break;
11599 +               default:        printk("type %lu\n", e820.map[i].type);
11600 +                               break;
11601 +               }
11602 +       }
11603 +}
11604 +
11605 +/*
11606 + * Sanitize the BIOS e820 map.
11607 + *
11608 + * Some e820 responses include overlapping entries.  The following 
11609 + * replaces the original e820 map with a new one, removing overlaps.
11610 + *
11611 + */
11612 +struct change_member {
11613 +       struct e820entry *pbios; /* pointer to original bios entry */
11614 +       unsigned long long addr; /* address for this change point */
11615 +};
11616 +static struct change_member change_point_list[2*E820MAX] __initdata;
11617 +static struct change_member *change_point[2*E820MAX] __initdata;
11618 +static struct e820entry *overlap_list[E820MAX] __initdata;
11619 +static struct e820entry new_bios[E820MAX] __initdata;
11620 +
11621 +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
11622 +{
11623 +       struct change_member *change_tmp;
11624 +       unsigned long current_type, last_type;
11625 +       unsigned long long last_addr;
11626 +       int chgidx, still_changing;
11627 +       int overlap_entries;
11628 +       int new_bios_entry;
11629 +       int old_nr, new_nr, chg_nr;
11630 +       int i;
11631 +
11632 +       /*
11633 +               Visually we're performing the following (1,2,3,4 = memory types)...
11634 +
11635 +               Sample memory map (w/overlaps):
11636 +                  ____22__________________
11637 +                  ______________________4_
11638 +                  ____1111________________
11639 +                  _44_____________________
11640 +                  11111111________________
11641 +                  ____________________33__
11642 +                  ___________44___________
11643 +                  __________33333_________
11644 +                  ______________22________
11645 +                  ___________________2222_
11646 +                  _________111111111______
11647 +                  _____________________11_
11648 +                  _________________4______
11649 +
11650 +               Sanitized equivalent (no overlap):
11651 +                  1_______________________
11652 +                  _44_____________________
11653 +                  ___1____________________
11654 +                  ____22__________________
11655 +                  ______11________________
11656 +                  _________1______________
11657 +                  __________3_____________
11658 +                  ___________44___________
11659 +                  _____________33_________
11660 +                  _______________2________
11661 +                  ________________1_______
11662 +                  _________________4______
11663 +                  ___________________2____
11664 +                  ____________________33__
11665 +                  ______________________4_
11666 +       */
11667 +
11668 +       /* if there's only one memory region, don't bother */
11669 +       if (*pnr_map < 2)
11670 +               return -1;
11671 +
11672 +       old_nr = *pnr_map;
11673 +
11674 +       /* bail out if we find any unreasonable addresses in bios map */
11675 +       for (i=0; i<old_nr; i++)
11676 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
11677 +                       return -1;
11678 +
11679 +       /* create pointers for initial change-point information (for sorting) */
11680 +       for (i=0; i < 2*old_nr; i++)
11681 +               change_point[i] = &change_point_list[i];
11682 +
11683 +       /* record all known change-points (starting and ending addresses),
11684 +          omitting those that are for empty memory regions */
11685 +       chgidx = 0;
11686 +       for (i=0; i < old_nr; i++)      {
11687 +               if (biosmap[i].size != 0) {
11688 +                       change_point[chgidx]->addr = biosmap[i].addr;
11689 +                       change_point[chgidx++]->pbios = &biosmap[i];
11690 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
11691 +                       change_point[chgidx++]->pbios = &biosmap[i];
11692 +               }
11693 +       }
11694 +       chg_nr = chgidx;        /* true number of change-points */
11695 +
11696 +       /* sort change-point list by memory addresses (low -> high) */
11697 +       still_changing = 1;
11698 +       while (still_changing)  {
11699 +               still_changing = 0;
11700 +               for (i=1; i < chg_nr; i++)  {
11701 +                       /* if <current_addr> > <last_addr>, swap */
11702 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
11703 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
11704 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
11705 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
11706 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
11707 +                          )
11708 +                       {
11709 +                               change_tmp = change_point[i];
11710 +                               change_point[i] = change_point[i-1];
11711 +                               change_point[i-1] = change_tmp;
11712 +                               still_changing=1;
11713 +                       }
11714 +               }
11715 +       }
11716 +
11717 +       /* create a new bios memory map, removing overlaps */
11718 +       overlap_entries=0;       /* number of entries in the overlap table */
11719 +       new_bios_entry=0;        /* index for creating new bios map entries */
11720 +       last_type = 0;           /* start with undefined memory type */
11721 +       last_addr = 0;           /* start with 0 as last starting address */
11722 +       /* loop through change-points, determining the effect on the new bios map */
11723 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
11724 +       {
11725 +               /* keep track of all overlapping bios entries */
11726 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
11727 +               {
11728 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
11729 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
11730 +               }
11731 +               else
11732 +               {
11733 +                       /* remove entry from list (order independent, so swap with last) */
11734 +                       for (i=0; i<overlap_entries; i++)
11735 +                       {
11736 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
11737 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
11738 +                       }
11739 +                       overlap_entries--;
11740 +               }
11741 +               /* if there are overlapping entries, decide which "type" to use */
11742 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
11743 +               current_type = 0;
11744 +               for (i=0; i<overlap_entries; i++)
11745 +                       if (overlap_list[i]->type > current_type)
11746 +                               current_type = overlap_list[i]->type;
11747 +               /* continue building up new bios map based on this information */
11748 +               if (current_type != last_type)  {
11749 +                       if (last_type != 0)      {
11750 +                               new_bios[new_bios_entry].size =
11751 +                                       change_point[chgidx]->addr - last_addr;
11752 +                               /* move forward only if the new size was non-zero */
11753 +                               if (new_bios[new_bios_entry].size != 0)
11754 +                                       if (++new_bios_entry >= E820MAX)
11755 +                                               break;  /* no more space left for new bios entries */
11756 +                       }
11757 +                       if (current_type != 0)  {
11758 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
11759 +                               new_bios[new_bios_entry].type = current_type;
11760 +                               last_addr=change_point[chgidx]->addr;
11761 +                       }
11762 +                       last_type = current_type;
11763 +               }
11764 +       }
11765 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
11766 +
11767 +       /* copy new bios mapping into original location */
11768 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
11769 +       *pnr_map = new_nr;
11770 +
11771 +       return 0;
11772 +}
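Editor's note: a small illustrative use of the sanitizer (the map contents are hypothetical and the function is not part of the patch). On an overlap the higher-numbered type wins, so RAM covering 0-1MB plus a reserved hole at 640k-1MB collapses into usable RAM below 640k and a reserved region above it. The sketch assumes the declarations from <asm/e820.h>.

    #include <linux/init.h>
    #include <asm/e820.h>

    static void __init sanitize_example(void)
    {
            struct e820entry raw[E820MAX];
            char nr = 0;

            raw[nr].addr = 0x00000000ULL;          /* RAM: 0 - 1MB */
            raw[nr].size = 0x00100000ULL;
            raw[nr++].type = E820_RAM;

            raw[nr].addr = 0x000A0000ULL;          /* reserved: 640k - 1MB */
            raw[nr].size = 0x00060000ULL;
            raw[nr++].type = E820_RESERVED;

            /* result: RAM 0-640k, reserved 640k-1MB, no overlaps */
            sanitize_e820_map(raw, &nr);
    }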
11773 +
11774 +/*
11775 + * Copy the BIOS e820 map into a safe place.
11776 + *
11777 + * Sanity-check it while we're at it..
11778 + *
11779 + * If we're lucky and live on a modern system, the setup code
11780 + * will have given us a memory map that we can use to properly
11781 + * set up memory.  If we aren't, we'll fake a memory map.
11782 + *
11783 + * We check to see that the memory map contains at least 2 elements
11784 + * before we'll use it, because the detection code in setup.S may
11785 + * not be perfect and most every PC known to man has two memory
11786 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
11787 + * thinkpad 560x, for example, does not cooperate with the memory
11788 + * detection code.)
11789 + */
11790 +int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
11791 +{
11792 +#ifndef CONFIG_XEN
11793 +       /* Only one memory region (or negative)? Ignore it */
11794 +       if (nr_map < 2)
11795 +               return -1;
11796 +#else
11797 +       BUG_ON(nr_map < 1);
11798 +#endif
11799 +
11800 +       do {
11801 +               unsigned long long start = biosmap->addr;
11802 +               unsigned long long size = biosmap->size;
11803 +               unsigned long long end = start + size;
11804 +               unsigned long type = biosmap->type;
11805 +
11806 +               /* Overflow in 64 bits? Ignore the memory map. */
11807 +               if (start > end)
11808 +                       return -1;
11809 +
11810 +#ifndef CONFIG_XEN
11811 +               /*
11812 +                * Some BIOSes claim RAM in the 640k - 1M region.
11813 +                * Not right. Fix it up.
11814 +                */
11815 +               if (type == E820_RAM) {
11816 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
11817 +                               if (start < 0xA0000ULL)
11818 +                                       add_memory_region(start, 0xA0000ULL-start, type);
11819 +                               if (end <= 0x100000ULL)
11820 +                                       continue;
11821 +                               start = 0x100000ULL;
11822 +                               size = end - start;
11823 +                       }
11824 +               }
11825 +#endif
11826 +               add_memory_region(start, size, type);
11827 +       } while (biosmap++,--nr_map);
11828 +       return 0;
11829 +}
11830 +
11831 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11832 +struct edd edd;
11833 +#ifdef CONFIG_EDD_MODULE
11834 +EXPORT_SYMBOL(edd);
11835 +#endif
11836 +/**
11837 + * copy_edd() - Copy the BIOS EDD information
11838 + *              from boot_params into a safe place.
11839 + *
11840 + */
11841 +static inline void copy_edd(void)
11842 +{
11843 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
11844 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
11845 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
11846 +     edd.edd_info_nr = EDD_NR;
11847 +}
11848 +#else
11849 +static inline void copy_edd(void)
11850 +{
11851 +}
11852 +#endif
11853 +
11854 +static int __initdata user_defined_memmap = 0;
11855 +
11856 +/*
11857 + * "mem=nopentium" disables the 4MB page tables.
11858 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
11859 + * to <mem>, overriding the bios size.
11860 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
11861 + * <start> to <start>+<mem>, overriding the bios size.
11862 + *
11863 + * HPA tells me bootloaders need to parse mem=, so no new
11864 + * option should be mem=  [also see Documentation/i386/boot.txt]
11865 + */
11866 +static int __init parse_mem(char *arg)
11867 +{
11868 +       if (!arg)
11869 +               return -EINVAL;
11870 +
11871 +       if (strcmp(arg, "nopentium") == 0) {
11872 +               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
11873 +               disable_pse = 1;
11874 +       } else {
11875 +               /* If the user specifies memory size, we
11876 +                * limit the BIOS-provided memory map to
11877 +                * that size. exactmap can be used to specify
11878 +                * the exact map. mem=number can be used to
11879 +                * trim the existing memory map.
11880 +                */
11881 +               unsigned long long mem_size;
11882 +
11883 +               mem_size = memparse(arg, &arg);
11884 +               limit_regions(mem_size);
11885 +               user_defined_memmap = 1;
11886 +       }
11887 +       return 0;
11888 +}
11889 +early_param("mem", parse_mem);
11890 +
11891 +static int __init parse_memmap(char *arg)
11892 +{
11893 +       if (!arg)
11894 +               return -EINVAL;
11895 +
11896 +       if (strcmp(arg, "exactmap") == 0) {
11897 +#ifdef CONFIG_CRASH_DUMP
11898 +               /* If we are doing a crash dump, we
11899 +                * still need to know the real mem
11900 +                * size before original memory map is
11901 +                * reset.
11902 +                */
11903 +               find_max_pfn();
11904 +               saved_max_pfn = max_pfn;
11905 +#endif
11906 +               e820.nr_map = 0;
11907 +               user_defined_memmap = 1;
11908 +       } else {
11909 +               /* If the user specifies memory size, we
11910 +                * limit the BIOS-provided memory map to
11911 +                * that size. exactmap can be used to specify
11912 +                * the exact map. mem=number can be used to
11913 +                * trim the existing memory map.
11914 +                */
11915 +               unsigned long long start_at, mem_size;
11916 +
11917 +               mem_size = memparse(arg, &arg);
11918 +               if (*arg == '@') {
11919 +                       start_at = memparse(arg+1, &arg);
11920 +                       add_memory_region(start_at, mem_size, E820_RAM);
11921 +               } else if (*arg == '#') {
11922 +                       start_at = memparse(arg+1, &arg);
11923 +                       add_memory_region(start_at, mem_size, E820_ACPI);
11924 +               } else if (*arg == '$') {
11925 +                       start_at = memparse(arg+1, &arg);
11926 +                       add_memory_region(start_at, mem_size, E820_RESERVED);
11927 +               } else {
11928 +                       limit_regions(mem_size);
11929 +                       user_defined_memmap = 1;
11930 +               }
11931 +       }
11932 +       return 0;
11933 +}
11934 +early_param("memmap", parse_memmap);
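Both handlers above lean on memparse() for the k/K/m/M/G size suffixes, and parse_memmap() then looks at the delimiter ('@' for RAM, '#' for ACPI data, '$' for reserved) to decide the region type. A minimal user-space sketch of that suffix-plus-offset parsing, using a hypothetical parse_size() stand-in rather than the kernel's memparse():

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the kernel's memparse(): parse a number with an
 * optional K/M/G suffix and return the value in bytes. */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);
	switch (**end) {
	case 'G': case 'g': v <<= 10; /* fall through */
	case 'M': case 'm': v <<= 10; /* fall through */
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	/* "memmap=64M@16M" describes 64 MiB of RAM starting at 16 MiB. */
	char *p, *arg = "64M@16M";
	unsigned long long size = parse_size(arg, &p);

	if (*p == '@')
		printf("RAM region: start=%#llx size=%#llx\n",
		       parse_size(p + 1, &p), size);
	return 0;
}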
11935 +
11936 +#ifdef CONFIG_PROC_VMCORE
11937 +/* elfcorehdr= specifies the location of elf core header
11938 + * stored by the crashed kernel.
11939 + */
11940 +static int __init parse_elfcorehdr(char *arg)
11941 +{
11942 +       if (!arg)
11943 +               return -EINVAL;
11944 +
11945 +       elfcorehdr_addr = memparse(arg, &arg);
11946 +       return 0;
11947 +}
11948 +early_param("elfcorehdr", parse_elfcorehdr);
11949 +#endif /* CONFIG_PROC_VMCORE */
11950 +
11951 +/*
11952 + * highmem=size forces highmem to be exactly 'size' bytes.
11953 + * This works even on boxes that have no highmem otherwise.
11954 + * This also works to reduce highmem size on bigger boxes.
11955 + */
11956 +static int __init parse_highmem(char *arg)
11957 +{
11958 +       if (!arg)
11959 +               return -EINVAL;
11960 +
11961 +       highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
11962 +       return 0;
11963 +}
11964 +early_param("highmem", parse_highmem);
11965 +
11966 +/*
11967 + * vmalloc=size forces the vmalloc area to be exactly 'size'
11968 + * bytes. This can be used to increase (or decrease) the
11969 + * vmalloc area - the default is 128m.
11970 + */
11971 +static int __init parse_vmalloc(char *arg)
11972 +{
11973 +       if (!arg)
11974 +               return -EINVAL;
11975 +
11976 +       __VMALLOC_RESERVE = memparse(arg, &arg);
11977 +       return 0;
11978 +}
11979 +early_param("vmalloc", parse_vmalloc);
11980 +
11981 +/*
11982 + * reservetop=size reserves a hole at the top of the kernel address space which
11983 + * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
11984 + * so relocating the fixmap can be done before paging initialization.
11985 + */
11986 +static int __init parse_reservetop(char *arg)
11987 +{
11988 +       unsigned long address;
11989 +
11990 +       if (!arg)
11991 +               return -EINVAL;
11992 +
11993 +       address = memparse(arg, &arg);
11994 +       reserve_top_address(address);
11995 +       return 0;
11996 +}
11997 +early_param("reservetop", parse_reservetop);
11998 +
11999 +/*
12000 + * Callback for efi_memory_walk.
12001 + */
12002 +static int __init
12003 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
12004 +{
12005 +       unsigned long *max_pfn = arg, pfn;
12006 +
12007 +       if (start < end) {
12008 +               pfn = PFN_UP(end -1);
12009 +               if (pfn > *max_pfn)
12010 +                       *max_pfn = pfn;
12011 +       }
12012 +       return 0;
12013 +}
12014 +
12015 +static int __init
12016 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
12017 +{
12018 +       memory_present(0, PFN_UP(start), PFN_DOWN(end));
12019 +       return 0;
12020 +}
12021 +
12022 + /*
12023 +  * This function checks if the entire range <start,end> is mapped with type.
12024 +  *
12025 +  * Note: this function only works correctly if the e820 table is sorted and
12026 +  * non-overlapping, which is the case
12027 +  */
12028 +int __init
12029 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
12030 +{
12031 +       u64 start = s;
12032 +       u64 end = e;
12033 +       int i;
12034 +       for (i = 0; i < e820.nr_map; i++) {
12035 +               struct e820entry *ei = &e820.map[i];
12036 +               if (type && ei->type != type)
12037 +                       continue;
12038 +               /* does this e820 entry overlap (at least partly) the range being checked? */
12039 +               if (ei->addr >= end || ei->addr + ei->size <= start)
12040 +                       continue;
12041 +               /* if the region is at the beginning of <start,end> we move
12042 +                * start to the end of the region since it's ok until there
12043 +                */
12044 +               if (ei->addr <= start)
12045 +                       start = ei->addr + ei->size;
12046 +               /* if start is now at or beyond end, we're done, full
12047 +                * coverage */
12048 +               if (start >= end)
12049 +                       return 1; /* we're done */
12050 +       }
12051 +       return 0;
12052 +}
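To see the walk above in action: with a sorted, non-overlapping map holding RAM at [0, 0x9f000) and [0x100000, 0x40000000), a query for [0x100000, 0x200000) advances start past the second entry and returns 1, while a query spanning the hole below 1 MB never reaches end and returns 0. A self-contained toy version of the same check (not the kernel's struct e820map):

#include <stdio.h>

struct range { unsigned long long addr, size; };

/* Same walk as e820_all_mapped() over a sorted, non-overlapping table:
 * return 1 if [s, e) is fully covered, 0 if any hole remains. */
static int all_covered(const struct range *map, int n,
		       unsigned long long s, unsigned long long e)
{
	int i;
	for (i = 0; i < n; i++) {
		if (map[i].addr >= e || map[i].addr + map[i].size <= s)
			continue;	/* no overlap with [s, e) */
		if (map[i].addr <= s)
			s = map[i].addr + map[i].size;
		if (s >= e)
			return 1;
	}
	return 0;
}

int main(void)
{
	struct range ram[] = { { 0x0, 0x9f000 }, { 0x100000, 0x3ff00000 } };

	printf("%d\n", all_covered(ram, 2, 0x100000, 0x200000));	/* 1 */
	printf("%d\n", all_covered(ram, 2, 0x90000, 0x110000));	/* 0 */
	return 0;
}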
12053 +
12054 +/*
12055 + * Find the highest page frame number we have available
12056 + */
12057 +void __init find_max_pfn(void)
12058 +{
12059 +       int i;
12060 +
12061 +       max_pfn = 0;
12062 +       if (efi_enabled) {
12063 +               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
12064 +               efi_memmap_walk(efi_memory_present_wrapper, NULL);
12065 +               return;
12066 +       }
12067 +
12068 +       for (i = 0; i < e820.nr_map; i++) {
12069 +               unsigned long start, end;
12070 +               /* RAM? */
12071 +               if (e820.map[i].type != E820_RAM)
12072 +                       continue;
12073 +               start = PFN_UP(e820.map[i].addr);
12074 +               end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
12075 +               if (start >= end)
12076 +                       continue;
12077 +               if (end > max_pfn)
12078 +                       max_pfn = end;
12079 +               memory_present(0, start, end);
12080 +       }
12081 +}
12082 +
12083 +/*
12084 + * Determine low and high memory ranges:
12085 + */
12086 +unsigned long __init find_max_low_pfn(void)
12087 +{
12088 +       unsigned long max_low_pfn;
12089 +
12090 +       max_low_pfn = max_pfn;
12091 +       if (max_low_pfn > MAXMEM_PFN) {
12092 +               if (highmem_pages == -1)
12093 +                       highmem_pages = max_pfn - MAXMEM_PFN;
12094 +               if (highmem_pages + MAXMEM_PFN < max_pfn)
12095 +                       max_pfn = MAXMEM_PFN + highmem_pages;
12096 +               if (highmem_pages + MAXMEM_PFN > max_pfn) {
12097 +                       printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
12098 +                       highmem_pages = 0;
12099 +               }
12100 +               max_low_pfn = MAXMEM_PFN;
12101 +#ifndef CONFIG_HIGHMEM
12102 +               /* Maximum memory usable is what is directly addressable */
12103 +               printk(KERN_WARNING "Warning only %ldMB will be used.\n",
12104 +                                       MAXMEM>>20);
12105 +               if (max_pfn > MAX_NONPAE_PFN)
12106 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
12107 +               else
12108 +                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
12109 +               max_pfn = MAXMEM_PFN;
12110 +#else /* !CONFIG_HIGHMEM */
12111 +#ifndef CONFIG_X86_PAE
12112 +               if (max_pfn > MAX_NONPAE_PFN) {
12113 +                       max_pfn = MAX_NONPAE_PFN;
12114 +                       printk(KERN_WARNING "Warning only 4GB will be used.\n");
12115 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
12116 +               }
12117 +#endif /* !CONFIG_X86_PAE */
12118 +#endif /* !CONFIG_HIGHMEM */
12119 +       } else {
12120 +               if (highmem_pages == -1)
12121 +                       highmem_pages = 0;
12122 +#ifdef CONFIG_HIGHMEM
12123 +               if (highmem_pages >= max_pfn) {
12124 +                       printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
12125 +                       highmem_pages = 0;
12126 +               }
12127 +               if (highmem_pages) {
12128 +                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
12129 +                               printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
12130 +                               highmem_pages = 0;
12131 +                       }
12132 +                       max_low_pfn -= highmem_pages;
12133 +               }
12134 +#else
12135 +               if (highmem_pages)
12136 +                       printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
12137 +#endif
12138 +       }
12139 +       return max_low_pfn;
12140 +}
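As a worked example of the default split above: with 4 KiB pages and a MAXMEM_PFN corresponding to roughly 896 MiB of lowmem (the exact value depends on PAGE_OFFSET and __VMALLOC_RESERVE), a 2 GiB machine with no highmem= override gets highmem_pages = max_pfn - MAXMEM_PFN, i.e. about 1152 MiB of highmem, and max_low_pfn is clamped to MAXMEM_PFN. A quick sketch of the arithmetic under those assumptions:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long maxmem_pfn = (896UL << 20) / page_size;	/* assumed ~896 MiB lowmem limit */
	unsigned long max_pfn = (2048UL << 20) / page_size;	/* a 2 GiB box */
	unsigned long highmem_pages = max_pfn - maxmem_pfn;	/* default when highmem= is not given */

	printf("lowmem  %lu MiB\n", (maxmem_pfn * page_size) >> 20);	/* 896 */
	printf("highmem %lu MiB\n", (highmem_pages * page_size) >> 20);	/* 1152 */
	return 0;
}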
12141 +
12142 +/*
12143 + * Free all available memory for boot time allocation.  Used
12144 + * as a callback function by efi_memory_walk()
12145 + */
12146 +
12147 +static int __init
12148 +free_available_memory(unsigned long start, unsigned long end, void *arg)
12149 +{
12150 +       /* check max_low_pfn */
12151 +       if (start >= (max_low_pfn << PAGE_SHIFT))
12152 +               return 0;
12153 +       if (end >= (max_low_pfn << PAGE_SHIFT))
12154 +               end = max_low_pfn << PAGE_SHIFT;
12155 +       if (start < end)
12156 +               free_bootmem(start, end - start);
12157 +
12158 +       return 0;
12159 +}
12160 +/*
12161 + * Register fully available low RAM pages with the bootmem allocator.
12162 + */
12163 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
12164 +{
12165 +       int i;
12166 +
12167 +       if (efi_enabled) {
12168 +               efi_memmap_walk(free_available_memory, NULL);
12169 +               return;
12170 +       }
12171 +       for (i = 0; i < e820.nr_map; i++) {
12172 +               unsigned long curr_pfn, last_pfn, size;
12173 +               /*
12174 +                * Reserve usable low memory
12175 +                */
12176 +               if (e820.map[i].type != E820_RAM)
12177 +                       continue;
12178 +               /*
12179 +                * We are rounding up the start address of usable memory:
12180 +                */
12181 +               curr_pfn = PFN_UP(e820.map[i].addr);
12182 +               if (curr_pfn >= max_low_pfn)
12183 +                       continue;
12184 +               /*
12185 +                * ... and at the end of the usable range downwards:
12186 +                */
12187 +               last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
12188 +
12189 +#ifdef CONFIG_XEN
12190 +               /*
12191 +                 * Truncate to the number of actual pages currently
12192 +                 * present.
12193 +                 */
12194 +               if (last_pfn > xen_start_info->nr_pages)
12195 +                       last_pfn = xen_start_info->nr_pages;
12196 +#endif
12197 +
12198 +               if (last_pfn > max_low_pfn)
12199 +                       last_pfn = max_low_pfn;
12200 +
12201 +               /*
12202 +                * .. finally, did all the rounding and playing
12203 +                * around just make the area go away?
12204 +                */
12205 +               if (last_pfn <= curr_pfn)
12206 +                       continue;
12207 +
12208 +               size = last_pfn - curr_pfn;
12209 +               free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
12210 +       }
12211 +}
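The PFN_UP()/PFN_DOWN() pair used above implements the rounding described in the comments: a region only contributes the page frames it fully contains, so its start address is rounded up and its end rounded down. A tiny illustration with the usual 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x1800, size = 0x3000;	/* 12 KiB starting mid-page */

	/* Only pages 2 and 3 are fully inside [0x1800, 0x4800). */
	printf("curr_pfn %lu, last_pfn %lu\n", PFN_UP(addr), PFN_DOWN(addr + size));
	return 0;
}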
12212 +
12213 +#ifndef CONFIG_XEN
12214 +/*
12215 + * workaround for Dell systems that neglect to reserve EBDA
12216 + */
12217 +static void __init reserve_ebda_region(void)
12218 +{
12219 +       unsigned int addr;
12220 +       addr = get_bios_ebda();
12221 +       if (addr)
12222 +               reserve_bootmem(addr, PAGE_SIZE);       
12223 +}
12224 +#endif
12225 +
12226 +#ifndef CONFIG_NEED_MULTIPLE_NODES
12227 +void __init setup_bootmem_allocator(void);
12228 +static unsigned long __init setup_memory(void)
12229 +{
12230 +       /*
12231 +        * partially used pages are not usable - thus
12232 +        * we are rounding upwards:
12233 +        */
12234 +       min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
12235 +               xen_start_info->nr_pt_frames;
12236 +
12237 +       find_max_pfn();
12238 +
12239 +       max_low_pfn = find_max_low_pfn();
12240 +
12241 +#ifdef CONFIG_HIGHMEM
12242 +       highstart_pfn = highend_pfn = max_pfn;
12243 +       if (max_pfn > max_low_pfn) {
12244 +               highstart_pfn = max_low_pfn;
12245 +       }
12246 +       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
12247 +               pages_to_mb(highend_pfn - highstart_pfn));
12248 +       num_physpages = highend_pfn;
12249 +       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
12250 +#else
12251 +       num_physpages = max_low_pfn;
12252 +       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
12253 +#endif
12254 +#ifdef CONFIG_FLATMEM
12255 +       max_mapnr = num_physpages;
12256 +#endif
12257 +       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
12258 +                       pages_to_mb(max_low_pfn));
12259 +
12260 +       setup_bootmem_allocator();
12261 +
12262 +       return max_low_pfn;
12263 +}
12264 +
12265 +void __init zone_sizes_init(void)
12266 +{
12267 +       /*
12268 +        * XEN: Our notion of "DMA memory" is fake when running over Xen.
12269 +        * We simply put all RAM in the DMA zone so that those drivers which
12270 +        * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
12271 +        * Those drivers that *do* require lowmem are screwed anyway when
12272 +        * running over Xen!
12273 +        */
12274 +       unsigned long max_zone_pfns[MAX_NR_ZONES];
12275 +       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
12276 +       max_zone_pfns[ZONE_DMA] = max_low_pfn;
12277 +       max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
12278 +#ifdef CONFIG_HIGHMEM
12279 +       max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
12280 +       add_active_range(0, 0, highend_pfn);
12281 +#else
12282 +       add_active_range(0, 0, max_low_pfn);
12283 +#endif
12284 +
12285 +       free_area_init_nodes(max_zone_pfns);
12286 +}
12287 +#else
12288 +extern unsigned long __init setup_memory(void);
12289 +extern void zone_sizes_init(void);
12290 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
12291 +
12292 +void __init setup_bootmem_allocator(void)
12293 +{
12294 +       unsigned long bootmap_size;
12295 +       /*
12296 +        * Initialize the boot-time allocator (with low memory only):
12297 +        */
12298 +       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
12299 +
12300 +       register_bootmem_low_pages(max_low_pfn);
12301 +
12302 +       /*
12303 +        * Reserve the bootmem bitmap itself as well. We do this in two
12304 +        * steps (first step was init_bootmem()) because this catches
12305 +        * the (very unlikely) case of us accidentally initializing the
12306 +        * bootmem allocator with an invalid RAM area.
12307 +        */
12308 +       reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
12309 +                        bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
12310 +
12311 +#ifndef CONFIG_XEN
12312 +       /*
12313 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
12314 +        * enabling clean reboots, SMP operation, laptop functions.
12315 +        */
12316 +       reserve_bootmem(0, PAGE_SIZE);
12317 +
12318 +       /* reserve EBDA region, it's a 4K region */
12319 +       reserve_ebda_region();
12320 +
12321 +    /* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
12322 +       PCI prefetch into it (errata #56). Usually the page is reserved anyways,
12323 +       unless you have no PS/2 mouse plugged in. */
12324 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
12325 +           boot_cpu_data.x86 == 6)
12326 +            reserve_bootmem(0xa0000 - 4096, 4096);
12327 +
12328 +#ifdef CONFIG_SMP
12329 +       /*
12330 +        * But first pinch a few for the stack/trampoline stuff
12331 +        * FIXME: Don't need the extra page at 4K, but need to fix
12332 +        * trampoline before removing it. (see the GDT stuff)
12333 +        */
12334 +       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
12335 +#endif
12336 +#ifdef CONFIG_ACPI_SLEEP
12337 +       /*
12338 +        * Reserve low memory region for sleep support.
12339 +        */
12340 +       acpi_reserve_bootmem();
12341 +#endif
12342 +#endif /* !CONFIG_XEN */
12343 +
12344 +#ifdef CONFIG_BLK_DEV_INITRD
12345 +       if (xen_start_info->mod_start) {
12346 +               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
12347 +                       /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
12348 +                       initrd_start = INITRD_START + PAGE_OFFSET;
12349 +                       initrd_end = initrd_start+INITRD_SIZE;
12350 +                       initrd_below_start_ok = 1;
12351 +               }
12352 +               else {
12353 +                       printk(KERN_ERR "initrd extends beyond end of memory "
12354 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
12355 +                           INITRD_START + INITRD_SIZE,
12356 +                           max_low_pfn << PAGE_SHIFT);
12357 +                       initrd_start = 0;
12358 +               }
12359 +       }
12360 +#endif
12361 +#ifdef CONFIG_KEXEC
12362 +#ifdef CONFIG_XEN
12363 +       xen_machine_kexec_setup_resources();
12364 +#else
12365 +       if (crashk_res.start != crashk_res.end)
12366 +               reserve_bootmem(crashk_res.start,
12367 +                       crashk_res.end - crashk_res.start + 1);
12368 +#endif
12369 +#endif
12370 +
12371 +       if (!xen_feature(XENFEAT_auto_translated_physmap))
12372 +               phys_to_machine_mapping =
12373 +                       (unsigned long *)xen_start_info->mfn_list;
12374 +}
12375 +
12376 +/*
12377 + * The node 0 pgdat is initialized before all of these because
12378 + * it's needed for bootmem.  node>0 pgdats have their virtual
12379 + * space allocated before the pagetables are in place to access
12380 + * them, so they can't be cleared then.
12381 + *
12382 + * This should all compile down to nothing when NUMA is off.
12383 + */
12384 +void __init remapped_pgdat_init(void)
12385 +{
12386 +       int nid;
12387 +
12388 +       for_each_online_node(nid) {
12389 +               if (nid != 0)
12390 +                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
12391 +       }
12392 +}
12393 +
12394 +/*
12395 + * Request address space for all standard RAM and ROM resources
12396 + * and also for regions reported as reserved by the e820.
12397 + */
12398 +static void __init
12399 +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
12400 +{
12401 +       int i;
12402 +       struct e820entry *map = e820.map;
12403 +       int nr_map = e820.nr_map;
12404 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
12405 +       struct xen_memory_map memmap;
12406 +
12407 +       map = machine_e820.map;
12408 +       memmap.nr_entries = E820MAX;
12409 +
12410 +       set_xen_guest_handle(memmap.buffer, map);
12411 +
12412 +       if(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
12413 +               BUG();
12414 +       machine_e820.nr_map = memmap.nr_entries;
12415 +       nr_map = memmap.nr_entries;
12416 +       e820_setup_gap(map, memmap.nr_entries);
12417 +#endif
12418 +
12419 +       probe_roms();
12420 +       for (i = 0; i < nr_map; i++) {
12421 +               struct resource *res;
12422 +#ifndef CONFIG_RESOURCES_64BIT
12423 +               if (map[i].addr + map[i].size > 0x100000000ULL)
12424 +                       continue;
12425 +#endif
12426 +               res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
12427 +               switch (map[i].type) {
12428 +               case E820_RAM:  res->name = "System RAM"; break;
12429 +               case E820_ACPI: res->name = "ACPI Tables"; break;
12430 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
12431 +               default:        res->name = "reserved";
12432 +               }
12433 +               res->start = map[i].addr;
12434 +               res->end = res->start + map[i].size - 1;
12435 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
12436 +               if (request_resource(&iomem_resource, res)) {
12437 +                       kfree(res);
12438 +                       continue;
12439 +               }
12440 +               if (map[i].type == E820_RAM) {
12441 +                       /*
12442 +                        *  We don't know which RAM region contains kernel data,
12443 +                        *  so we try it repeatedly and let the resource manager
12444 +                        *  test it.
12445 +                        */
12446 +#ifndef CONFIG_XEN
12447 +                       request_resource(res, code_resource);
12448 +                       request_resource(res, data_resource);
12449 +#endif
12450 +#ifdef CONFIG_KEXEC
12451 +                       if (crashk_res.start != crashk_res.end)
12452 +                            request_resource(res, &crashk_res);
12453 +#ifdef CONFIG_XEN
12454 +                       xen_machine_kexec_register_resources(res);
12455 +#endif
12456 +#endif
12457 +               }
12458 +       }
12459 +}
12460 +
12461 +/*
12462 + * Request address space for all standard resources
12463 + *
12464 + * This is called just before pcibios_init(), which is also a
12465 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
12466 + */
12467 +static int __init request_standard_resources(void)
12468 +{
12469 +       int i;
12470 +
12471 +       /* Nothing to do if not running in dom0. */
12472 +       if (!is_initial_xendomain())
12473 +               return 0;
12474 +
12475 +       printk("Setting up standard PCI resources\n");
12476 +       if (efi_enabled)
12477 +               efi_initialize_iomem_resources(&code_resource, &data_resource);
12478 +       else
12479 +               legacy_init_iomem_resources(&code_resource, &data_resource);
12480 +
12481 +       /* EFI systems may still have VGA */
12482 +       request_resource(&iomem_resource, &video_ram_resource);
12483 +
12484 +       /* request I/O space for devices used on all i[345]86 PCs */
12485 +       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
12486 +               request_resource(&ioport_resource, &standard_io_resources[i]);
12487 +       return 0;
12488 +}
12489 +
12490 +subsys_initcall(request_standard_resources);
12491 +
12492 +/*
12493 + * Locate an unused range of the physical address space below 4G which
12494 + * can be used for PCI mappings.
12495 + */
12496 +static void __init
12497 +e820_setup_gap(struct e820entry *e820, int nr_map)
12498 +{
12499 +       unsigned long gapstart, gapsize, round;
12500 +       unsigned long long last;
12501 +       int i;
12502 +
12503 +       /*
12504 +        * Search for the biggest gap in the low 32 bits of the e820
12505 +        * memory space.
12506 +        */
12507 +       last = 0x100000000ull;
12508 +       gapstart = 0x10000000;
12509 +       gapsize = 0x400000;
12510 +       i = nr_map;
12511 +       while (--i >= 0) {
12512 +               unsigned long long start = e820[i].addr;
12513 +               unsigned long long end = start + e820[i].size;
12514 +
12515 +               /*
12516 +                * Since "last" is at most 4GB, we know we'll
12517 +                * fit in 32 bits if this condition is true
12518 +                */
12519 +               if (last > end) {
12520 +                       unsigned long gap = last - end;
12521 +
12522 +                       if (gap > gapsize) {
12523 +                               gapsize = gap;
12524 +                               gapstart = end;
12525 +                       }
12526 +               }
12527 +               if (start < last)
12528 +                       last = start;
12529 +       }
12530 +
12531 +       /*
12532 +        * See how much we want to round up: start off with
12533 +        * rounding to the next 1MB area.
12534 +        */
12535 +       round = 0x100000;
12536 +       while ((gapsize >> 4) > round)
12537 +               round += round;
12538 +       /* Fun with two's complement */
12539 +       pci_mem_start = (gapstart + round) & -round;
12540 +
12541 +       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
12542 +               pci_mem_start, gapstart, gapsize);
12543 +}
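The final expression above, (gapstart + round) & -round, depends on round being a power of two: -round is then a mask with the low bits clear, so the result is gapstart advanced to the next multiple of round beyond it. A small standalone illustration of the same rounding, with made-up example values:

#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xcff00000;	/* example: end of highest RAM below 4G */
	unsigned long gapsize  = 0x30000000;	/* example gap size */
	unsigned long round    = 0x100000;	/* start at 1 MiB, as above */

	/* Grow the rounding unit until it is at least 1/16 of the gap. */
	while ((gapsize >> 4) > round)
		round += round;

	/* For a power of two, -round == ~(round - 1), clearing the low bits. */
	printf("pci_mem_start = %#lx (round = %#lx)\n",
	       (gapstart + round) & -round, round);
	return 0;
}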
12544 +
12545 +static void __init register_memory(void)
12546 +{
12547 +#ifndef CONFIG_XEN
12548 +       e820_setup_gap(e820.map, e820.nr_map);
12549 +#endif
12550 +}
12551 +
12552 +#ifdef CONFIG_MCA
12553 +static void set_mca_bus(int x)
12554 +{
12555 +       MCA_bus = x;
12556 +}
12557 +#else
12558 +static void set_mca_bus(int x) { }
12559 +#endif
12560 +
12561 +/*
12562 + * Determine if we were loaded by an EFI loader.  If so, then we have also been
12563 + * passed the efi memmap, systab, etc., so we should use these data structures
12564 + * for initialization.  Note, the efi init code path is determined by the
12565 + * global efi_enabled. This allows the same kernel image to be used on existing
12566 + * systems (with a traditional BIOS) as well as on EFI systems.
12567 + */
12568 +void __init setup_arch(char **cmdline_p)
12569 +{
12570 +       int i, j, k, fpp;
12571 +       struct physdev_set_iopl set_iopl;
12572 +       unsigned long max_low_pfn;
12573 +
12574 +       /* Force a quick death if the kernel panics (not domain 0). */
12575 +       extern int panic_timeout;
12576 +       if (!panic_timeout && !is_initial_xendomain())
12577 +               panic_timeout = 1;
12578 +
12579 +       /* Register a call for panic conditions. */
12580 +       atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12581 +
12582 +       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
12583 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
12584 +                            VMASST_TYPE_writable_pagetables);
12585 +
12586 +       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12587 +       pre_setup_arch_hook();
12588 +       early_cpu_init();
12589 +
12590 +       /*
12591 +        * FIXME: This isn't an official loader_type right
12592 +        * now but does currently work with elilo.
12593 +        * If we were configured as an EFI kernel, check to make
12594 +        * sure that we were loaded correctly from elilo and that
12595 +        * the system table is valid.  If not, then initialize normally.
12596 +        */
12597 +#ifdef CONFIG_EFI
12598 +       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
12599 +               efi_enabled = 1;
12600 +#endif
12601 +
12602 +       /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12603 +          properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12604 +       */
12605 +       ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12606 +       drive_info = DRIVE_INFO;
12607 +       screen_info = SCREEN_INFO;
12608 +       edid_info = EDID_INFO;
12609 +       apm_info.bios = APM_BIOS_INFO;
12610 +       ist_info = IST_INFO;
12611 +       saved_videomode = VIDEO_MODE;
12612 +       if( SYS_DESC_TABLE.length != 0 ) {
12613 +               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
12614 +               machine_id = SYS_DESC_TABLE.table[0];
12615 +               machine_submodel_id = SYS_DESC_TABLE.table[1];
12616 +               BIOS_revision = SYS_DESC_TABLE.table[2];
12617 +       }
12618 +       bootloader_type = LOADER_TYPE;
12619 +
12620 +       if (is_initial_xendomain()) {
12621 +               /* This is drawn from a dump from vgacon:startup in
12622 +                * standard Linux. */
12623 +               screen_info.orig_video_mode = 3; 
12624 +               screen_info.orig_video_isVGA = 1;
12625 +               screen_info.orig_video_lines = 25;
12626 +               screen_info.orig_video_cols = 80;
12627 +               screen_info.orig_video_ega_bx = 3;
12628 +               screen_info.orig_video_points = 16;
12629 +               screen_info.orig_y = screen_info.orig_video_lines - 1;
12630 +               if (xen_start_info->console.dom0.info_size >=
12631 +                   sizeof(struct dom0_vga_console_info)) {
12632 +                       const struct dom0_vga_console_info *info =
12633 +                               (struct dom0_vga_console_info *)(
12634 +                                       (char *)xen_start_info +
12635 +                                       xen_start_info->console.dom0.info_off);
12636 +                       dom0_init_screen_info(info);
12637 +               }
12638 +               xen_start_info->console.domU.mfn = 0;
12639 +               xen_start_info->console.domU.evtchn = 0;
12640 +       } else
12641 +               screen_info.orig_video_isVGA = 0;
12642 +
12643 +#ifdef CONFIG_BLK_DEV_RAM
12644 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
12645 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
12646 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
12647 +#endif
12648 +
12649 +       setup_xen_features();
12650 +
12651 +       ARCH_SETUP
12652 +       if (efi_enabled)
12653 +               efi_init();
12654 +       else {
12655 +               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
12656 +               print_memory_map(machine_specific_memory_setup());
12657 +       }
12658 +
12659 +       copy_edd();
12660 +
12661 +       if (!MOUNT_ROOT_RDONLY)
12662 +               root_mountflags &= ~MS_RDONLY;
12663 +       init_mm.start_code = (unsigned long) _text;
12664 +       init_mm.end_code = (unsigned long) _etext;
12665 +       init_mm.end_data = (unsigned long) _edata;
12666 +       init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12667 +                      xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12668 +
12669 +       code_resource.start = virt_to_phys(_text);
12670 +       code_resource.end = virt_to_phys(_etext)-1;
12671 +       data_resource.start = virt_to_phys(_etext);
12672 +       data_resource.end = virt_to_phys(_edata)-1;
12673 +
12674 +       parse_early_param();
12675 +
12676 +       if (user_defined_memmap) {
12677 +               printk(KERN_INFO "user-defined physical RAM map:\n");
12678 +               print_memory_map("user");
12679 +       }
12680 +
12681 +       strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
12682 +       *cmdline_p = command_line;
12683 +
12684 +       max_low_pfn = setup_memory();
12685 +
12686 +       /*
12687 +        * NOTE: before this point _nobody_ is allowed to allocate
12688 +        * any memory using the bootmem allocator.  Although the
12689 +        * allocator is now initialised, only the first 8Mb of the kernel
12690 +        * virtual address space has been mapped.  All allocations before
12691 +        * paging_init() has completed must use the alloc_bootmem_low_pages()
12692 +        * variant (which allocates DMA'able memory) and care must be taken
12693 +        * not to exceed the 8Mb limit.
12694 +        */
12695 +
12696 +#ifdef CONFIG_SMP
12697 +       smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
12698 +#endif
12699 +       paging_init();
12700 +       remapped_pgdat_init();
12701 +       sparse_init();
12702 +       zone_sizes_init();
12703 +
12704 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12705 +       /*
12706 +        * Find and reserve possible boot-time SMP configuration:
12707 +        */
12708 +       find_smp_config();
12709 +#endif
12710 +       numa_kva_reserve();
12711 +
12712 +       /* Make sure we have a correctly sized P->M table. */
12713 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12714 +               phys_to_machine_mapping = alloc_bootmem_low_pages(
12715 +                    max_pfn * sizeof(unsigned long));
12716 +               memset(phys_to_machine_mapping, ~0,
12717 +                      max_pfn * sizeof(unsigned long));
12718 +               memcpy(phys_to_machine_mapping,
12719 +                      (unsigned long *)xen_start_info->mfn_list,
12720 +                      xen_start_info->nr_pages * sizeof(unsigned long));
12721 +               free_bootmem(
12722 +                    __pa(xen_start_info->mfn_list),
12723 +                    PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12724 +                                    sizeof(unsigned long))));
12725 +
12726 +               /*
12727 +                * Initialise the list of the frames that specify the list of
12728 +                * frames that make up the p2m table. Used by save/restore
12729 +                */
12730 +               pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
12731 +               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12732 +                    virt_to_mfn(pfn_to_mfn_frame_list_list);
12733 +
12734 +               fpp = PAGE_SIZE/sizeof(unsigned long);
12735 +               for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
12736 +                       if ((j % fpp) == 0) {
12737 +                               k++;
12738 +                               BUG_ON(k>=16);
12739 +                               pfn_to_mfn_frame_list[k] =
12740 +                                       alloc_bootmem_low_pages(PAGE_SIZE);
12741 +                               pfn_to_mfn_frame_list_list[k] =
12742 +                                       virt_to_mfn(pfn_to_mfn_frame_list[k]);
12743 +                               j=0;
12744 +                       }
12745 +                       pfn_to_mfn_frame_list[k][j] =
12746 +                               virt_to_mfn(&phys_to_machine_mapping[i]);
12747 +               }
12748 +               HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12749 +       }
12750 +
12751 +       /*
12752 +        * NOTE: at this point the bootmem allocator is fully available.
12753 +        */
12754 +
12755 +       if (is_initial_xendomain())
12756 +               dmi_scan_machine();
12757 +
12758 +#ifdef CONFIG_X86_GENERICARCH
12759 +       generic_apic_probe();
12760 +#endif 
12761 +       if (efi_enabled)
12762 +               efi_map_memmap();
12763 +
12764 +       set_iopl.iopl = 1;
12765 +       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
12766 +
12767 +#ifdef CONFIG_ACPI
12768 +       if (!is_initial_xendomain()) {
12769 +               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12770 +               acpi_disabled = 1;
12771 +               acpi_ht = 0;
12772 +       }
12773 +
12774 +       /*
12775 +        * Parse the ACPI tables for possible boot-time SMP configuration.
12776 +        */
12777 +       acpi_boot_table_init();
12778 +#endif
12779 +
12780 +#ifdef CONFIG_PCI
12781 +#ifdef CONFIG_X86_IO_APIC
12782 +       check_acpi_pci();       /* Checks more than just ACPI actually */
12783 +#endif
12784 +#endif
12785 +
12786 +#ifdef CONFIG_ACPI
12787 +       acpi_boot_init();
12788 +
12789 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
12790 +       if (def_to_bigsmp)
12791 +               printk(KERN_WARNING "More than 8 CPUs detected and "
12792 +                       "CONFIG_X86_PC cannot handle it.\nUse "
12793 +                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
12794 +#endif
12795 +#endif
12796 +#ifdef CONFIG_X86_LOCAL_APIC
12797 +       if (smp_found_config)
12798 +               get_smp_config();
12799 +#endif
12800 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
12801 +       prefill_possible_map();
12802 +#endif
12803 +
12804 +       register_memory();
12805 +
12806 +       if (is_initial_xendomain()) {
12807 +#ifdef CONFIG_VT
12808 +#if defined(CONFIG_VGA_CONSOLE)
12809 +               if (!efi_enabled ||
12810 +                   (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12811 +                       conswitchp = &vga_con;
12812 +#elif defined(CONFIG_DUMMY_CONSOLE)
12813 +               conswitchp = &dummy_con;
12814 +#endif
12815 +#endif
12816 +       } else {
12817 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
12818 +               conswitchp = &dummy_con;
12819 +#endif
12820 +       }
12821 +       xencons_early_setup();
12822 +#ifdef CONFIG_X86_TSC
12823 +       tsc_init();
12824 +#endif
12825 +}
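For the phys-to-machine bookkeeping set up in the middle of setup_arch() above: phys_to_machine_mapping[] holds one MFN per PFN, the pfn_to_mfn_frame_list[k] pages record the MFNs of the pages backing that array, and pfn_to_mfn_frame_list_list names those frame-list pages, giving the two-level structure the save/restore tools walk. A rough sketch of the indexing, assuming i386's 4 KiB pages and 4-byte entries (fpp == 1024):

#include <stdio.h>

int main(void)
{
	unsigned long fpp = 4096 / 4;	/* p2m entries per page on i386 */
	unsigned long pfn = 300000;	/* example guest page */

	/* phys_to_machine_mapping[pfn] lives in p2m page pfn / fpp; that page's
	 * MFN sits in pfn_to_mfn_frame_list[k][j], and page k of the frame list
	 * is itself named by pfn_to_mfn_frame_list_list[k]. */
	unsigned long frame = pfn / fpp;
	unsigned long k = frame / fpp;
	unsigned long j = frame % fpp;

	printf("pfn %lu -> p2m page %lu -> frame_list[%lu][%lu]\n", pfn, frame, k, j);
	return 0;
}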
12826 +
12827 +static int
12828 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12829 +{
12830 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
12831 +       /* we're never actually going to get here... */
12832 +       return NOTIFY_DONE;
12833 +}
12834 +
12835 +static __init int add_pcspkr(void)
12836 +{
12837 +       struct platform_device *pd;
12838 +       int ret;
12839 +
12840 +       pd = platform_device_alloc("pcspkr", -1);
12841 +       if (!pd)
12842 +               return -ENOMEM;
12843 +
12844 +       ret = platform_device_add(pd);
12845 +       if (ret)
12846 +               platform_device_put(pd);
12847 +
12848 +       return ret;
12849 +}
12850 +device_initcall(add_pcspkr);
12851 +
12852 +/*
12853 + * Local Variables:
12854 + * mode:c
12855 + * c-file-style:"k&r"
12856 + * c-basic-offset:8
12857 + * End:
12858 + */
12859 diff -ruNp linux-2.6.19/arch/i386/kernel/smp-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/smp-xen.c
12860 --- linux-2.6.19/arch/i386/kernel/smp-xen.c     1970-01-01 00:00:00.000000000 +0000
12861 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/smp-xen.c   2007-02-02 19:10:21.000000000 +0000
12862 @@ -0,0 +1,635 @@
12863 +/*
12864 + *     Intel SMP support routines.
12865 + *
12866 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
12867 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
12868 + *
12869 + *     This code is released under the GNU General Public License version 2 or
12870 + *     later.
12871 + */
12872 +
12873 +#include <linux/init.h>
12874 +
12875 +#include <linux/mm.h>
12876 +#include <linux/delay.h>
12877 +#include <linux/spinlock.h>
12878 +#include <linux/smp_lock.h>
12879 +#include <linux/kernel_stat.h>
12880 +#include <linux/mc146818rtc.h>
12881 +#include <linux/cache.h>
12882 +#include <linux/interrupt.h>
12883 +#include <linux/cpu.h>
12884 +#include <linux/module.h>
12885 +
12886 +#include <asm/mtrr.h>
12887 +#include <asm/tlbflush.h>
12888 +#if 0
12889 +#include <mach_apic.h>
12890 +#endif
12891 +#include <xen/evtchn.h>
12892 +
12893 +/*
12894 + *     Some notes on x86 processor bugs affecting SMP operation:
12895 + *
12896 + *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
12897 + *     The Linux implications for SMP are handled as follows:
12898 + *
12899 + *     Pentium III / [Xeon]
12900 + *             None of the E1AP-E3AP errata are visible to the user.
12901 + *
12902 + *     E1AP.   see PII A1AP
12903 + *     E2AP.   see PII A2AP
12904 + *     E3AP.   see PII A3AP
12905 + *
12906 + *     Pentium II / [Xeon]
12907 + *             None of the A1AP-A3AP errata are visible to the user.
12908 + *
12909 + *     A1AP.   see PPro 1AP
12910 + *     A2AP.   see PPro 2AP
12911 + *     A3AP.   see PPro 7AP
12912 + *
12913 + *     Pentium Pro
12914 + *             None of 1AP-9AP errata are visible to the normal user,
12915 + *     except occasional delivery of 'spurious interrupt' as trap #15.
12916 + *     This is very rare and a non-problem.
12917 + *
12918 + *     1AP.    Linux maps APIC as non-cacheable
12919 + *     2AP.    worked around in hardware
12920 + *     3AP.    fixed in C0 and above steppings microcode update.
12921 + *             Linux does not use excessive STARTUP_IPIs.
12922 + *     4AP.    worked around in hardware
12923 + *     5AP.    symmetric IO mode (normal Linux operation) not affected.
12924 + *             'noapic' mode has vector 0xf filled out properly.
12925 + *     6AP.    'noapic' mode might be affected - fixed in later steppings
12926 + *     7AP.    We do not assume writes to the LVT deasserting IRQs
12927 + *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
12928 + *     9AP.    We do not use mixed mode
12929 + *
12930 + *     Pentium
12931 + *             There is a marginal case where REP MOVS on 100MHz SMP
12932 + *     machines with B stepping processors can fail. XXX should provide
12933 + *     an L1cache=Writethrough or L1cache=off option.
12934 + *
12935 + *             B stepping CPUs may hang. There are hardware work arounds
12936 + *     for this. We warn about it in case your board doesn't have the work
12937 + *     arounds. Basically that's so I can tell anyone with a B stepping
12938 + *     CPU and SMP problems "tough".
12939 + *
12940 + *     Specific items [From Pentium Processor Specification Update]
12941 + *
12942 + *     1AP.    Linux doesn't use remote read
12943 + *     2AP.    Linux doesn't trust APIC errors
12944 + *     3AP.    We work around this
12945 + *     4AP.    Linux never generated 3 interrupts of the same priority
12946 + *             to cause a lost local interrupt.
12947 + *     5AP.    Remote read is never used
12948 + *     6AP.    not affected - worked around in hardware
12949 + *     7AP.    not affected - worked around in hardware
12950 + *     8AP.    worked around in hardware - we get explicit CS errors if not
12951 + *     9AP.    only 'noapic' mode affected. Might generate spurious
12952 + *             interrupts, we log only the first one and count the
12953 + *             rest silently.
12954 + *     10AP.   not affected - worked around in hardware
12955 + *     11AP.   Linux reads the APIC between writes to avoid this, as per
12956 + *             the documentation. Make sure you preserve this as it affects
12957 + *             the C stepping chips too.
12958 + *     12AP.   not affected - worked around in hardware
12959 + *     13AP.   not affected - worked around in hardware
12960 + *     14AP.   we always deassert INIT during bootup
12961 + *     15AP.   not affected - worked around in hardware
12962 + *     16AP.   not affected - worked around in hardware
12963 + *     17AP.   not affected - worked around in hardware
12964 + *     18AP.   not affected - worked around in hardware
12965 + *     19AP.   not affected - worked around in BIOS
12966 + *
12967 + *     If this sounds worrying believe me these bugs are either ___RARE___,
12968 + *     or are signal timing bugs worked around in hardware and there's
12969 + *     about nothing of note with C stepping upwards.
12970 + */
12971 +
12972 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
12973 +
12974 +/*
12975 + * the following functions deal with sending IPIs between CPUs.
12976 + *
12977 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
12978 + */
12979 +
12980 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
12981 +{
12982 +       unsigned int icr = shortcut | APIC_DEST_LOGICAL;
12983 +
12984 +       switch (vector) {
12985 +       default:
12986 +               icr |= APIC_DM_FIXED | vector;
12987 +               break;
12988 +       case NMI_VECTOR:
12989 +               icr |= APIC_DM_NMI;
12990 +               break;
12991 +       }
12992 +       return icr;
12993 +}
12994 +
12995 +static inline int __prepare_ICR2 (unsigned int mask)
12996 +{
12997 +       return SET_APIC_DEST_FIELD(mask);
12998 +}
12999 +
13000 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
13001 +
13002 +static inline void __send_IPI_one(unsigned int cpu, int vector)
13003 +{
13004 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
13005 +       BUG_ON(irq < 0);
13006 +       notify_remote_via_irq(irq);
13007 +}
13008 +
13009 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
13010 +{
13011 +       int cpu;
13012 +
13013 +       switch (shortcut) {
13014 +       case APIC_DEST_SELF:
13015 +               __send_IPI_one(smp_processor_id(), vector);
13016 +               break;
13017 +       case APIC_DEST_ALLBUT:
13018 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
13019 +                       if (cpu == smp_processor_id())
13020 +                               continue;
13021 +                       if (cpu_isset(cpu, cpu_online_map)) {
13022 +                               __send_IPI_one(cpu, vector);
13023 +                       }
13024 +               }
13025 +               break;
13026 +       default:
13027 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
13028 +                      vector);
13029 +               break;
13030 +       }
13031 +}
13032 +
13033 +void fastcall send_IPI_self(int vector)
13034 +{
13035 +       __send_IPI_shortcut(APIC_DEST_SELF, vector);
13036 +}
13037 +
13038 +/*
13039 + * This is only used on smaller machines.
13040 + */
13041 +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
13042 +{
13043 +       unsigned long mask = cpus_addr(cpumask)[0];
13044 +       unsigned long flags;
13045 +       unsigned int cpu;
13046 +
13047 +       local_irq_save(flags);
13048 +       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
13049 +
13050 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
13051 +               if (cpu_isset(cpu, cpumask)) {
13052 +                       __send_IPI_one(cpu, vector);
13053 +               }
13054 +       }
13055 +
13056 +       local_irq_restore(flags);
13057 +}
13058 +
13059 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
13060 +{
13061 +
13062 +       send_IPI_mask_bitmask(mask, vector);
13063 +}
13064 +
13065 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
13066 +
13067 +#if 0 /* XEN */
13068 +/*
13069 + *     Smarter SMP flushing macros. 
13070 + *             c/o Linus Torvalds.
13071 + *
13072 + *     These mean you can really definitely utterly forget about
13073 + *     writing to user space from interrupts. (It's not allowed anyway).
13074 + *
13075 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
13076 + */
13077 +
13078 +static cpumask_t flush_cpumask;
13079 +static struct mm_struct * flush_mm;
13080 +static unsigned long flush_va;
13081 +static DEFINE_SPINLOCK(tlbstate_lock);
13082 +#define FLUSH_ALL      0xffffffff
13083 +
13084 +/*
13085 + * We cannot call mmdrop() because we are in interrupt context, 
13086 + * instead update mm->cpu_vm_mask.
13087 + *
13088 + * We need to reload %cr3 since the page tables may be going
13089 + * away from under us..
13090 + */
13091 +static inline void leave_mm (unsigned long cpu)
13092 +{
13093 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
13094 +               BUG();
13095 +       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
13096 +       load_cr3(swapper_pg_dir);
13097 +}
13098 +
13099 +/*
13100 + *
13101 + * The flush IPI assumes that a thread switch happens in this order:
13102 + * [cpu0: the cpu that switches]
13103 + * 1) switch_mm() either 1a) or 1b)
13104 + * 1a) thread switch to a different mm
13105 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
13106 + *     Stop ipi delivery for the old mm. This is not synchronized with
13107 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
13108 + *     for the wrong mm, and in the worst case we perform a superfluous
13109 + *     tlb flush.
13110 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
13111 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
13112 + *     was in lazy tlb mode.
13113 + * 1a3) update cpu_tlbstate[].active_mm
13114 + *     Now cpu0 accepts tlb flushes for the new mm.
13115 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
13116 + *     Now the other cpus will send tlb flush ipis.
13117 + * 1a4) change cr3.
13118 + * 1b) thread switch without mm change
13119 + *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
13120 + *     flush ipis.
13121 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
13122 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
13123 + *     Atomically set the bit [other cpus will start sending flush ipis],
13124 + *     and test the bit.
13125 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
13126 + * 2) switch %%esp, ie current
13127 + *
13128 + * The interrupt must handle 2 special cases:
13129 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
13130 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
13131 + *   runs in kernel space, the cpu could load tlb entries for user space
13132 + *   pages.
13133 + *
13134 + * The good news is that cpu_tlbstate is local to each cpu, no
13135 + * write/read ordering problems.
13136 + */
13137 +
13138 +/*
13139 + * TLB flush IPI:
13140 + *
13141 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
13142 + * 2) Leave the mm if we are in the lazy tlb mode.
13143 + */
13144 +
13145 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
13146 +                                    struct pt_regs *regs)
13147 +{
13148 +       struct pt_regs *old_regs = set_irq_regs(regs);
13149 +       unsigned long cpu;
13150 +
13151 +       cpu = get_cpu();
13152 +
13153 +       if (!cpu_isset(cpu, flush_cpumask))
13154 +               goto out;
13155 +               /* 
13156 +                * This was a BUG() but until someone can quote me the
13157 +                * line from the intel manual that guarantees an IPI to
13158 +                * multiple CPUs is retried _only_ on the erroring CPUs
13159 +                * it's staying as a return
13160 +                *
13161 +                * BUG();
13162 +                */
13163 +                
13164 +       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
13165 +               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
13166 +                       if (flush_va == FLUSH_ALL)
13167 +                               local_flush_tlb();
13168 +                       else
13169 +                               __flush_tlb_one(flush_va);
13170 +               } else
13171 +                       leave_mm(cpu);
13172 +       }
13173 +       smp_mb__before_clear_bit();
13174 +       cpu_clear(cpu, flush_cpumask);
13175 +       smp_mb__after_clear_bit();
13176 +out:
13177 +       put_cpu_no_resched();
13178 +       set_irq_regs(old_regs);
13179 +
13180 +       return IRQ_HANDLED;
13181 +}
13182 +
13183 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
13184 +                                               unsigned long va)
13185 +{
13186 +       /*
13187 +        * A couple of (to be removed) sanity checks:
13188 +        *
13189 +        * - current CPU must not be in mask
13190 +        * - mask must exist :)
13191 +        */
13192 +       BUG_ON(cpus_empty(cpumask));
13193 +       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
13194 +       BUG_ON(!mm);
13195 +
13196 +       /* If a CPU which we ran on has gone down, OK. */
13197 +       cpus_and(cpumask, cpumask, cpu_online_map);
13198 +       if (cpus_empty(cpumask))
13199 +               return;
13200 +
13201 +       /*
13202 +        * I'm not happy about this global shared spinlock in the
13203 +        * MM hot path, but we'll see how contended it is.
13204 +        * Temporarily this turns IRQs off, so that lockups are
13205 +        * detected by the NMI watchdog.
13206 +        */
13207 +       spin_lock(&tlbstate_lock);
13208 +       
13209 +       flush_mm = mm;
13210 +       flush_va = va;
13211 +#if NR_CPUS <= BITS_PER_LONG
13212 +       atomic_set_mask(cpumask, &flush_cpumask);
13213 +#else
13214 +       {
13215 +               int k;
13216 +               unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
13217 +               unsigned long *cpu_mask = (unsigned long *)&cpumask;
13218 +               for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
13219 +                       atomic_set_mask(cpu_mask[k], &flush_mask[k]);
13220 +       }
13221 +#endif
13222 +       /*
13223 +        * We have to send the IPI only to
13224 +        * CPUs affected.
13225 +        */
13226 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
13227 +
13228 +       while (!cpus_empty(flush_cpumask))
13229 +               /* nothing. lockup detection does not belong here */
13230 +               mb();
13231 +
13232 +       flush_mm = NULL;
13233 +       flush_va = 0;
13234 +       spin_unlock(&tlbstate_lock);
13235 +}
13236 +       
13237 +void flush_tlb_current_task(void)
13238 +{
13239 +       struct mm_struct *mm = current->mm;
13240 +       cpumask_t cpu_mask;
13241 +
13242 +       preempt_disable();
13243 +       cpu_mask = mm->cpu_vm_mask;
13244 +       cpu_clear(smp_processor_id(), cpu_mask);
13245 +
13246 +       local_flush_tlb();
13247 +       if (!cpus_empty(cpu_mask))
13248 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
13249 +       preempt_enable();
13250 +}
13251 +
13252 +void flush_tlb_mm (struct mm_struct * mm)
13253 +{
13254 +       cpumask_t cpu_mask;
13255 +
13256 +       preempt_disable();
13257 +       cpu_mask = mm->cpu_vm_mask;
13258 +       cpu_clear(smp_processor_id(), cpu_mask);
13259 +
13260 +       if (current->active_mm == mm) {
13261 +               if (current->mm)
13262 +                       local_flush_tlb();
13263 +               else
13264 +                       leave_mm(smp_processor_id());
13265 +       }
13266 +       if (!cpus_empty(cpu_mask))
13267 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
13268 +
13269 +       preempt_enable();
13270 +}
13271 +
13272 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
13273 +{
13274 +       struct mm_struct *mm = vma->vm_mm;
13275 +       cpumask_t cpu_mask;
13276 +
13277 +       preempt_disable();
13278 +       cpu_mask = mm->cpu_vm_mask;
13279 +       cpu_clear(smp_processor_id(), cpu_mask);
13280 +
13281 +       if (current->active_mm == mm) {
13282 +               if(current->mm)
13283 +                       __flush_tlb_one(va);
13284 +                else
13285 +                       leave_mm(smp_processor_id());
13286 +       }
13287 +
13288 +       if (!cpus_empty(cpu_mask))
13289 +               flush_tlb_others(cpu_mask, mm, va);
13290 +
13291 +       preempt_enable();
13292 +}
13293 +EXPORT_SYMBOL(flush_tlb_page);
13294 +
13295 +static void do_flush_tlb_all(void* info)
13296 +{
13297 +       unsigned long cpu = smp_processor_id();
13298 +
13299 +       __flush_tlb_all();
13300 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
13301 +               leave_mm(cpu);
13302 +}
13303 +
13304 +void flush_tlb_all(void)
13305 +{
13306 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
13307 +}
13308 +
13309 +#else
13310 +
13311 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
13312 +                                    struct pt_regs *regs)
13313 +{ return 0; }
13314 +void flush_tlb_current_task(void)
13315 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
13316 +void flush_tlb_mm(struct mm_struct * mm)
13317 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
13318 +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
13319 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
13320 +EXPORT_SYMBOL(flush_tlb_page);
13321 +void flush_tlb_all(void)
13322 +{ xen_tlb_flush_all(); }
13323 +
13324 +#endif /* XEN */
13325 +
13326 +/*
13327 + * this function sends a 'reschedule' IPI to another CPU.
13328 + * it goes straight through and wastes no time serializing
13329 + * anything. Worst case is that we lose a reschedule ...
13330 + */
13331 +void smp_send_reschedule(int cpu)
13332 +{
13333 +       WARN_ON(cpu_is_offline(cpu));
13334 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
13335 +}
13336 +
13337 +/*
13338 + * Structure and data for smp_call_function(). This is designed to minimise
13339 + * static memory requirements. It also looks cleaner.
13340 + */
13341 +static DEFINE_SPINLOCK(call_lock);
13342 +
13343 +struct call_data_struct {
13344 +       void (*func) (void *info);
13345 +       void *info;
13346 +       atomic_t started;
13347 +       atomic_t finished;
13348 +       int wait;
13349 +};
13350 +
13351 +void lock_ipi_call_lock(void)
13352 +{
13353 +       spin_lock_irq(&call_lock);
13354 +}
13355 +
13356 +void unlock_ipi_call_lock(void)
13357 +{
13358 +       spin_unlock_irq(&call_lock);
13359 +}
13360 +
13361 +static struct call_data_struct *call_data;
13362 +
13363 +/**
13364 + * smp_call_function(): Run a function on all other CPUs.
13365 + * @func: The function to run. This must be fast and non-blocking.
13366 + * @info: An arbitrary pointer to pass to the function.
13367 + * @nonatomic: currently unused.
13368 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
13369 + *
13370 + * Returns 0 on success, else a negative status code. Does not return until
13371 + * remote CPUs are nearly ready to execute <<func>> or are or have executed.
13372 + *
13373 + * You must not call this function with disabled interrupts or from a
13374 + * hardware interrupt handler or from a bottom half handler.
13375 + */
13376 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
13377 +                       int wait)
13378 +{
13379 +       struct call_data_struct data;
13380 +       int cpus;
13381 +
13382 +       /* Holding any lock stops cpus from going down. */
13383 +       spin_lock(&call_lock);
13384 +       cpus = num_online_cpus() - 1;
13385 +       if (!cpus) {
13386 +               spin_unlock(&call_lock);
13387 +               return 0;
13388 +       }
13389 +
13390 +       /* Can deadlock when called with interrupts disabled */
13391 +       WARN_ON(irqs_disabled());
13392 +
13393 +       data.func = func;
13394 +       data.info = info;
13395 +       atomic_set(&data.started, 0);
13396 +       data.wait = wait;
13397 +       if (wait)
13398 +               atomic_set(&data.finished, 0);
13399 +
13400 +       call_data = &data;
13401 +       mb();
13402 +       
13403 +       /* Send a message to all other CPUs and wait for them to respond */
13404 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
13405 +
13406 +       /* Wait for response */
13407 +       while (atomic_read(&data.started) != cpus)
13408 +               barrier();
13409 +
13410 +       if (wait)
13411 +               while (atomic_read(&data.finished) != cpus)
13412 +                       barrier();
13413 +       spin_unlock(&call_lock);
13414 +
13415 +       return 0;
13416 +}
13417 +EXPORT_SYMBOL(smp_call_function);
13418 +
13419 +static void stop_this_cpu (void * dummy)
13420 +{
13421 +       /*
13422 +        * Remove this CPU:
13423 +        */
13424 +       cpu_clear(smp_processor_id(), cpu_online_map);
13425 +       local_irq_disable();
13426 +#if 0
13427 +       disable_local_APIC();
13428 +#endif
13429 +       if (cpu_data[smp_processor_id()].hlt_works_ok)
13430 +               for(;;) halt();
13431 +       for (;;);
13432 +}
13433 +
13434 +/*
13435 + * this function calls the 'stop' function on all other CPUs in the system.
13436 + */
13437 +
13438 +void smp_send_stop(void)
13439 +{
13440 +       smp_call_function(stop_this_cpu, NULL, 1, 0);
13441 +
13442 +       local_irq_disable();
13443 +#if 0
13444 +       disable_local_APIC();
13445 +#endif
13446 +       local_irq_enable();
13447 +}
13448 +
13449 +/*
13450 + * Reschedule call back. Nothing to do,
13451 + * all the work is done automatically when
13452 + * we return from the interrupt.
13453 + */
13454 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
13455 +                                    struct pt_regs *regs)
13456 +{
13457 +       struct pt_regs *old_regs = set_irq_regs(regs);
13458 +       set_irq_regs(old_regs);
13459 +
13460 +       return IRQ_HANDLED;
13461 +}
13462 +
13463 +#include <linux/kallsyms.h>
13464 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
13465 +                                       struct pt_regs *regs)
13466 +{
13467 +       struct pt_regs *old_regs = set_irq_regs(regs);
13468 +       void (*func) (void *info) = call_data->func;
13469 +       void *info = call_data->info;
13470 +       int wait = call_data->wait;
13471 +
13472 +       /*
13473 +        * Notify initiating CPU that I've grabbed the data and am
13474 +        * about to execute the function
13475 +        */
13476 +       mb();
13477 +       atomic_inc(&call_data->started);
13478 +       /*
13479 +        * At this point the info structure may be out of scope unless wait==1
13480 +        */
13481 +       irq_enter();
13482 +       (*func)(info);
13483 +       irq_exit();
13484 +
13485 +       if (wait) {
13486 +               mb();
13487 +               atomic_inc(&call_data->finished);
13488 +       }
13489 +       set_irq_regs(old_regs);
13490 +
13491 +       return IRQ_HANDLED;
13492 +}
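
The initiator/responder handshake implemented by smp_call_function() and smp_call_function_interrupt() above can be exercised outside the kernel. A minimal user-space sketch, assuming POSIX threads and C11 atomics stand in for CPUs and IPIs (all names below are illustrative and not part of the patch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct call_data {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct call_data *call_data;

static void say_hello(void *info)
{
	printf("func ran with info=%ld\n", *(long *)info);
}

/* Responder side: mirrors what smp_call_function_interrupt() does per CPU. */
static void *responder(void *arg)
{
	void (*func)(void *) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	atomic_fetch_add(&call_data->started, 1);   /* "I have copied the data" */
	func(info);
	if (wait)
		atomic_fetch_add(&call_data->finished, 1);
	return NULL;
}

int main(void)
{
	enum { NCPUS = 3 };
	pthread_t tid[NCPUS];
	long token = 42;
	struct call_data data = { .func = say_hello, .info = &token, .wait = 1 };

	call_data = &data;
	for (int i = 0; i < NCPUS; i++)
		pthread_create(&tid[i], NULL, responder, NULL);

	/* Initiator side: spin until every responder has grabbed the data ... */
	while (atomic_load(&data.started) != NCPUS)
		;
	/* ... and, because wait == 1, until every responder has run func. */
	while (atomic_load(&data.finished) != NCPUS)
		;

	for (int i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}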
13493 +
13494 +int safe_smp_processor_id(void)
13495 +{
13496 +       return smp_processor_id();
13497 +}
13498 diff -ruNp linux-2.6.19/arch/i386/kernel/swiotlb.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/swiotlb.c
13499 --- linux-2.6.19/arch/i386/kernel/swiotlb.c     1970-01-01 00:00:00.000000000 +0000
13500 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/swiotlb.c   2007-02-02 19:10:21.000000000 +0000
13501 @@ -0,0 +1,683 @@
13502 +/*
13503 + * Dynamic DMA mapping support.
13504 + *
13505 + * This implementation is a fallback for platforms that do not support
13506 + * I/O TLBs (aka DMA address translation hardware).
13507 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
13508 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
13509 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
13510 + *     David Mosberger-Tang <davidm@hpl.hp.com>
13511 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
13512 + */
13513 +
13514 +#include <linux/cache.h>
13515 +#include <linux/mm.h>
13516 +#include <linux/module.h>
13517 +#include <linux/pci.h>
13518 +#include <linux/spinlock.h>
13519 +#include <linux/string.h>
13520 +#include <linux/types.h>
13521 +#include <linux/ctype.h>
13522 +#include <linux/init.h>
13523 +#include <linux/bootmem.h>
13524 +#include <linux/highmem.h>
13525 +#include <asm/io.h>
13526 +#include <asm/pci.h>
13527 +#include <asm/dma.h>
13528 +#include <asm/uaccess.h>
13529 +#include <xen/interface/memory.h>
13530 +
13531 +int swiotlb;
13532 +EXPORT_SYMBOL(swiotlb);
13533 +
13534 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
13535 +
13536 +#define SG_ENT_PHYS_ADDRESS(sg)        (page_to_bus((sg)->page) + (sg)->offset)
13537 +
13538 +/*
13539 + * Maximum allowable number of contiguous slabs to map,
13540 + * must be a power of 2.  What is the appropriate value?
13541 + * The complexity of {map,unmap}_single is linearly dependent on this value.
13542 + */
13543 +#define IO_TLB_SEGSIZE 128
13544 +
13545 +/*
13546 + * log of the size of each IO TLB slab.  The number of slabs is command line
13547 + * controllable.
13548 + */
13549 +#define IO_TLB_SHIFT 11
13550 +
13551 +/* Width of DMA addresses. 30 bits is a b44 limitation. */
13552 +#define DEFAULT_DMA_BITS 30
13553 +
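
Taken together, the constants above fix the pool geometry: each slab is 2^IO_TLB_SHIFT bytes and a segment of IO_TLB_SEGSIZE slabs bounds the largest single mapping. A stand-alone calculation of those numbers (the 64 MB figure is the default used later by swiotlb_init):

#include <stdio.h>

#define IO_TLB_SHIFT   11     /* 2 KiB per slab    */
#define IO_TLB_SEGSIZE 128    /* slabs per segment */

int main(void)
{
	unsigned long slab   = 1UL << IO_TLB_SHIFT;
	unsigned long seg    = (unsigned long)IO_TLB_SEGSIZE << IO_TLB_SHIFT;
	unsigned long defsz  = 64UL << 20;             /* 64 MB default aperture */
	unsigned long nslabs = defsz >> IO_TLB_SHIFT;

	printf("slab size          : %lu bytes\n", slab);    /* 2048   */
	printf("largest single map : %lu bytes\n", seg);     /* 262144 */
	printf("slabs in 64 MB     : %lu\n", nslabs);        /* 32768  */
	return 0;
}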
13554 +int swiotlb_force;
13555 +static char *iotlb_virt_start;
13556 +static unsigned long iotlb_nslabs;
13557 +
13558 +/*
13559 + * Used to do a quick range check in swiotlb_unmap_single and
13560 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
13561 + * API.
13562 + */
13563 +static unsigned long iotlb_pfn_start, iotlb_pfn_end;
13564 +
13565 +/* Does the given dma address reside within the swiotlb aperture? */
13566 +static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
13567 +{
13568 +       unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
13569 +       return (pfn_valid(pfn)
13570 +               && (pfn >= iotlb_pfn_start)
13571 +               && (pfn < iotlb_pfn_end));
13572 +}
13573 +
13574 +/*
13575 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
13576 + */
13577 +static unsigned long io_tlb_overflow = 32*1024;
13578 +
13579 +void *io_tlb_overflow_buffer;
13580 +
13581 +/*
13582 + * This is a free list describing the number of free entries available from
13583 + * each index
13584 + */
13585 +static unsigned int *io_tlb_list;
13586 +static unsigned int io_tlb_index;
13587 +
13588 +/*
13589 + * We need to save away the original address corresponding to a mapped entry
13590 + * for the sync operations.
13591 + */
13592 +static struct phys_addr {
13593 +       struct page *page;
13594 +       unsigned int offset;
13595 +} *io_tlb_orig_addr;
13596 +
13597 +/*
13598 + * Protect the above data structures in the map and unmap calls
13599 + */
13600 +static DEFINE_SPINLOCK(io_tlb_lock);
13601 +
13602 +unsigned int dma_bits = DEFAULT_DMA_BITS;
13603 +static int __init
13604 +setup_dma_bits(char *str)
13605 +{
13606 +       dma_bits = simple_strtoul(str, NULL, 0);
13607 +       return 0;
13608 +}
13609 +__setup("dma_bits=", setup_dma_bits);
13610 +
13611 +static int __init
13612 +setup_io_tlb_npages(char *str)
13613 +{
13614 +       /* Unlike ia64, the size is the aperture in megabytes, not 'slabs'! */
13615 +       if (isdigit(*str)) {
13616 +               iotlb_nslabs = simple_strtoul(str, &str, 0) <<
13617 +                       (20 - IO_TLB_SHIFT);
13618 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
13619 +               /* Round up to power of two (xen_create_contiguous_region). */
13620 +               while (iotlb_nslabs & (iotlb_nslabs-1))
13621 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
13622 +       }
13623 +       if (*str == ',')
13624 +               ++str;
13625 +       /*
13626 +         * NB. 'force' enables the swiotlb, but doesn't force its use for
13627 +         * every DMA like it does on native Linux. 'off' forcibly disables
13628 +         * use of the swiotlb.
13629 +         */
13630 +       if (!strcmp(str, "force"))
13631 +               swiotlb_force = 1;
13632 +       else if (!strcmp(str, "off"))
13633 +               swiotlb_force = -1;
13634 +       return 1;
13635 +}
13636 +__setup("swiotlb=", setup_io_tlb_npages);
13637 +/* make io_tlb_overflow tunable too? */
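
The round-up loop in setup_io_tlb_npages() relies on n & ~(n-1) isolating the lowest set bit of n; adding that bit repeatedly drives n up to the next power of two, which xen_create_contiguous_region() requires. A minimal user-space sketch of just that step, with the kernel's ALIGN macro reproduced for self-containment:

#include <stdio.h>

#define IO_TLB_SEGSIZE 128
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static unsigned long round_nslabs(unsigned long megabytes)
{
	/* 20 - IO_TLB_SHIFT == 9: one megabyte holds 512 two-KiB slabs. */
	unsigned long n = megabytes << 9;

	n = ALIGN(n, IO_TLB_SEGSIZE);
	while (n & (n - 1))              /* not yet a power of two */
		n += n & ~(n - 1);       /* add the lowest set bit */
	return n;
}

int main(void)
{
	printf("swiotlb=5  -> %lu slabs\n", round_nslabs(5));    /* 2560 rounds up to 4096 */
	printf("swiotlb=64 -> %lu slabs\n", round_nslabs(64));   /* 32768 is already 2^15  */
	return 0;
}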
13638 +
13639 +/*
13640 + * Statically reserve bounce buffer space and initialize bounce buffer data
13641 + * structures for the software IO TLB used to implement the PCI DMA API.
13642 + */
13643 +void
13644 +swiotlb_init_with_default_size (size_t default_size)
13645 +{
13646 +       unsigned long i, bytes;
13647 +
13648 +       if (!iotlb_nslabs) {
13649 +               iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
13650 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
13651 +               /* Round up to power of two (xen_create_contiguous_region). */
13652 +               while (iotlb_nslabs & (iotlb_nslabs-1))
13653 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
13654 +       }
13655 +
13656 +       bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
13657 +
13658 +       /*
13659 +        * Get IO TLB memory from the low pages
13660 +        */
13661 +       iotlb_virt_start = alloc_bootmem_low_pages(bytes);
13662 +       if (!iotlb_virt_start)
13663 +               panic("Cannot allocate SWIOTLB buffer!\n"
13664 +                     "Use dom0_mem Xen boot parameter to reserve\n"
13665 +                     "some DMA memory (e.g., dom0_mem=-128M).\n");
13666 +
13667 +       for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
13668 +               int rc = xen_create_contiguous_region(
13669 +                       (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
13670 +                       get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
13671 +                       dma_bits);
13672 +               BUG_ON(rc);
13673 +       }
13674 +
13675 +       /*
13676 +        * Allocate and initialize the free list array.  This array is used
13677 +        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
13678 +        */
13679 +       io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
13680 +       for (i = 0; i < iotlb_nslabs; i++)
13681 +               io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
13682 +       io_tlb_index = 0;
13683 +       io_tlb_orig_addr = alloc_bootmem(
13684 +               iotlb_nslabs * sizeof(*io_tlb_orig_addr));
13685 +
13686 +       /*
13687 +        * Get the overflow emergency buffer
13688 +        */
13689 +       io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
13690 +
13691 +       iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
13692 +       iotlb_pfn_end   = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
13693 +
13694 +       printk(KERN_INFO "Software IO TLB enabled: \n"
13695 +              " Aperture:     %lu megabytes\n"
13696 +              " Kernel range: 0x%016lx - 0x%016lx\n"
13697 +              " Address size: %u bits\n",
13698 +              bytes >> 20,
13699 +              (unsigned long)iotlb_virt_start,
13700 +              (unsigned long)iotlb_virt_start + bytes,
13701 +              dma_bits);
13702 +}
13703 +
13704 +void
13705 +swiotlb_init(void)
13706 +{
13707 +       long ram_end;
13708 +       size_t defsz = 64 * (1 << 20); /* 64MB default size */
13709 +
13710 +       if (swiotlb_force == 1) {
13711 +               swiotlb = 1;
13712 +       } else if ((swiotlb_force != -1) &&
13713 +                  is_running_on_xen() &&
13714 +                  is_initial_xendomain()) {
13715 +               /* Domain 0 always has a swiotlb. */
13716 +               ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
13717 +               if (ram_end <= 0x7ffff)
13718 +                       defsz = 2 * (1 << 20); /* 2MB on systems with <2GB of RAM. */
13719 +               swiotlb = 1;
13720 +       }
13721 +
13722 +       if (swiotlb)
13723 +               swiotlb_init_with_default_size(defsz);
13724 +       else
13725 +               printk(KERN_INFO "Software IO TLB disabled\n");
13726 +}
13727 +
13728 +/*
13729 + * We use __copy_to_user_inatomic to transfer to the host buffer because the
13730 + * buffer may be mapped read-only (e.g, in blkback driver) but lower-level
13731 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
13732 + * unnecessary copy from the aperture to the host buffer, and a page fault.
13733 + */
13734 +static void
13735 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
13736 +{
13737 +       if (PageHighMem(buffer.page)) {
13738 +               size_t len, bytes;
13739 +               char *dev, *host, *kmp;
13740 +               len = size;
13741 +               while (len != 0) {
13742 +                       if (((bytes = len) + buffer.offset) > PAGE_SIZE)
13743 +                               bytes = PAGE_SIZE - buffer.offset;
13744 +                       kmp  = kmap_atomic(buffer.page, KM_SWIOTLB);
13745 +                       dev  = dma_addr + size - len;
13746 +                       host = kmp + buffer.offset;
13747 +                       if (dir == DMA_FROM_DEVICE) {
13748 +                               if (__copy_to_user_inatomic(host, dev, bytes))
13749 +                                       /* inaccessible */;
13750 +                       } else
13751 +                               memcpy(dev, host, bytes);
13752 +                       kunmap_atomic(kmp, KM_SWIOTLB);
13753 +                       len -= bytes;
13754 +                       buffer.page++;
13755 +                       buffer.offset = 0;
13756 +               }
13757 +       } else {
13758 +               char *host = (char *)phys_to_virt(
13759 +                       page_to_pseudophys(buffer.page)) + buffer.offset;
13760 +               if (dir == DMA_FROM_DEVICE) {
13761 +                       if (__copy_to_user_inatomic(host, dma_addr, size))
13762 +                               /* inaccessible */;
13763 +               } else if (dir == DMA_TO_DEVICE)
13764 +                       memcpy(dma_addr, host, size);
13765 +       }
13766 +}
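
__sync_single() above copies in exactly one direction per call: bounce slot to host buffer for DMA_FROM_DEVICE, host buffer to bounce slot for DMA_TO_DEVICE. A user-space sketch of that contract, with two plain arrays standing in for the host page and the aperture slot (names are illustrative):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

enum dma_dir { DMA_TO_DEVICE, DMA_FROM_DEVICE };

/* Copy between the caller's buffer and the bounce slot, direction-dependent. */
static void sync_single(char *host, char *bounce, size_t size, enum dma_dir dir)
{
	if (dir == DMA_FROM_DEVICE)
		memcpy(host, bounce, size);   /* device wrote: bounce -> host      */
	else
		memcpy(bounce, host, size);   /* device will read: host -> bounce  */
}

int main(void)
{
	char host[16]   = "driver payload";
	char bounce[16] = {0};

	sync_single(host, bounce, sizeof(host), DMA_TO_DEVICE);
	printf("bounce now holds: %s\n", bounce);

	strcpy(bounce, "device reply");        /* pretend the device DMAed into the slot */
	sync_single(host, bounce, sizeof(host), DMA_FROM_DEVICE);
	printf("host now holds:   %s\n", host);
	return 0;
}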
13767 +
13768 +/*
13769 + * Allocates bounce buffer and returns its kernel virtual address.
13770 + */
13771 +static void *
13772 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
13773 +{
13774 +       unsigned long flags;
13775 +       char *dma_addr;
13776 +       unsigned int nslots, stride, index, wrap;
13777 +       int i;
13778 +
13779 +       /*
13780 +        * For mappings greater than a page, we limit the stride (and
13781 +        * hence alignment) to a page size.
13782 +        */
13783 +       nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13784 +       if (size > PAGE_SIZE)
13785 +               stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
13786 +       else
13787 +               stride = 1;
13788 +
13789 +       BUG_ON(!nslots);
13790 +
13791 +       /*
13792 +        * Find suitable number of IO TLB entries size that will fit this
13793 +        * request and allocate a buffer from that IO TLB pool.
13794 +        */
13795 +       spin_lock_irqsave(&io_tlb_lock, flags);
13796 +       {
13797 +               wrap = index = ALIGN(io_tlb_index, stride);
13798 +
13799 +               if (index >= iotlb_nslabs)
13800 +                       wrap = index = 0;
13801 +
13802 +               do {
13803 +                       /*
13804 +                        * If we find a slot that indicates we have 'nslots'
13805 +                        * number of contiguous buffers, we allocate the
13806 +                        * buffers from that slot and mark the entries as '0'
13807 +                        * indicating unavailable.
13808 +                        */
13809 +                       if (io_tlb_list[index] >= nslots) {
13810 +                               int count = 0;
13811 +
13812 +                               for (i = index; i < (int)(index + nslots); i++)
13813 +                                       io_tlb_list[i] = 0;
13814 +                               for (i = index - 1;
13815 +                                    (OFFSET(i, IO_TLB_SEGSIZE) !=
13816 +                                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13817 +                                    i--)
13818 +                                       io_tlb_list[i] = ++count;
13819 +                               dma_addr = iotlb_virt_start +
13820 +                                       (index << IO_TLB_SHIFT);
13821 +
13822 +                               /*
13823 +                                * Update the indices to avoid searching in
13824 +                                * the next round.
13825 +                                */
13826 +                               io_tlb_index = 
13827 +                                       ((index + nslots) < iotlb_nslabs
13828 +                                        ? (index + nslots) : 0);
13829 +
13830 +                               goto found;
13831 +                       }
13832 +                       index += stride;
13833 +                       if (index >= iotlb_nslabs)
13834 +                               index = 0;
13835 +               } while (index != wrap);
13836 +
13837 +               spin_unlock_irqrestore(&io_tlb_lock, flags);
13838 +               return NULL;
13839 +       }
13840 +  found:
13841 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
13842 +
13843 +       /*
13844 +        * Save away the mapping from the original address to the DMA address.
13845 +        * This is needed when we sync the memory.  Then we sync the buffer if
13846 +        * needed.
13847 +        */
13848 +       io_tlb_orig_addr[index] = buffer;
13849 +       if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13850 +               __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
13851 +
13852 +       return dma_addr;
13853 +}
13854 +
13855 +/*
13856 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
13857 + */
13858 +static void
13859 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13860 +{
13861 +       unsigned long flags;
13862 +       int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13863 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13864 +       struct phys_addr buffer = io_tlb_orig_addr[index];
13865 +
13866 +       /*
13867 +        * First, sync the memory before unmapping the entry
13868 +        */
13869 +       if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13870 +               __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
13871 +
13872 +       /*
13873 +        * Return the buffer to the free list by setting the corresponding
13874 +        * entries to indicate the number of contiguous entries available.
13875 +        * While returning the entries to the free list, we merge the entries
13876 +        * with slots below and above the pool being returned.
13877 +        */
13878 +       spin_lock_irqsave(&io_tlb_lock, flags);
13879 +       {
13880 +               count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
13881 +                        io_tlb_list[index + nslots] : 0);
13882 +               /*
13883 +                * Step 1: return the slots to the free list, merging the
13884 +                * slots with succeeding slots
13885 +                */
13886 +               for (i = index + nslots - 1; i >= index; i--)
13887 +                       io_tlb_list[i] = ++count;
13888 +               /*
13889 +                * Step 2: merge the returned slots with the preceding slots,
13890 +                * if available (non zero)
13891 +                */
13892 +               for (i = index - 1;
13893 +                    (OFFSET(i, IO_TLB_SEGSIZE) !=
13894 +                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13895 +                    i--)
13896 +                       io_tlb_list[i] = ++count;
13897 +       }
13898 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
13899 +}
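
map_single() and unmap_single() above manage the aperture with a counter per slot: each free slot records how many contiguous free slots remain up to the end of its segment, so one comparison tells whether a run of nslots fits, and freeing re-counts the run and merges it with its free neighbours. A scaled-down user-space model of that bookkeeping, assuming an 8-slot segment, a 16-slot pool and a fixed stride of 1 purely for illustration:

#include <stdio.h>

#define SEGSIZE 8                         /* stand-in for IO_TLB_SEGSIZE */
#define NSLABS  16                        /* stand-in for iotlb_nslabs   */
#define OFFSET(i)      ((i) & (SEGSIZE - 1))
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

static int list[NSLABS];                  /* stand-in for io_tlb_list  */
static int next_index;                    /* stand-in for io_tlb_index */

static void init_list(void)
{
	for (int i = 0; i < NSLABS; i++)
		list[i] = SEGSIZE - OFFSET(i);    /* 8,7,...,1 per segment */
}

/* Find nslots contiguous free slots; return the start index or -1. */
static int alloc_slots(int nslots)
{
	int wrap, index, count = 0;

	wrap = index = (next_index >= NSLABS) ? 0 : next_index;
	do {
		if (list[index] >= nslots) {
			for (int i = index; i < index + nslots; i++)
				list[i] = 0;
			/* Re-count the free run just below the allocation. */
			for (int i = index - 1;
			     OFFSET(i) != SEGSIZE - 1 && list[i]; i--)
				list[i] = ++count;
			next_index = (index + nslots) % NSLABS;
			return index;
		}
		index = (index + 1) % NSLABS;
	} while (index != wrap);
	return -1;                             /* pool exhausted */
}

/* Return nslots starting at index, merging with free neighbours. */
static void free_slots(int index, int nslots)
{
	int count = (index + nslots < ALIGN_UP(index + 1, SEGSIZE))
		    ? list[index + nslots] : 0;

	for (int i = index + nslots - 1; i >= index; i--)
		list[i] = ++count;
	for (int i = index - 1; OFFSET(i) != SEGSIZE - 1 && list[i]; i--)
		list[i] = ++count;
}

int main(void)
{
	init_list();
	int a = alloc_slots(3);                /* expect 0 */
	int b = alloc_slots(2);                /* expect 3 */
	free_slots(a, 3);                      /* slots 0-2 become a free run again */
	printf("a=%d b=%d list[0]=%d\n", a, b, list[0]);   /* a=0 b=3 list[0]=3 */
	return 0;
}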
13900 +
13901 +static void
13902 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13903 +{
13904 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13905 +       struct phys_addr buffer = io_tlb_orig_addr[index];
13906 +       BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
13907 +       __sync_single(buffer, dma_addr, size, dir);
13908 +}
13909 +
13910 +static void
13911 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
13912 +{
13913 +       /*
13914 +        * Ran out of IOMMU space for this operation. This is very bad.
13915 +        * Unfortunately the drivers cannot handle this operation properly
13916 +        * unless they check for pci_dma_mapping_error (most don't).
13917 +        * When the mapping is small enough, return a static buffer to limit
13918 +        * the damage, or panic when the transfer is too big.
13919 +        */
13920 +       printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
13921 +              "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
13922 +
13923 +       if (size > io_tlb_overflow && do_panic) {
13924 +               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13925 +                       panic("PCI-DMA: Memory would be corrupted\n");
13926 +               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13927 +                       panic("PCI-DMA: Random memory would be DMAed\n");
13928 +       }
13929 +}
13930 +
13931 +/*
13932 + * Map a single buffer of the indicated size for DMA in streaming mode.  The
13933 + * PCI address to use is returned.
13934 + *
13935 + * Once the device is given the dma address, the device owns this memory until
13936 + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
13937 + */
13938 +dma_addr_t
13939 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
13940 +{
13941 +       dma_addr_t dev_addr = virt_to_bus(ptr);
13942 +       void *map;
13943 +       struct phys_addr buffer;
13944 +
13945 +       BUG_ON(dir == DMA_NONE);
13946 +
13947 +       /*
13948 +        * If the pointer passed in happens to be in the device's DMA window,
13949 +        * we can safely return the device addr and not worry about bounce
13950 +        * buffering it.
13951 +        */
13952 +       if (!range_straddles_page_boundary(ptr, size) &&
13953 +           !address_needs_mapping(hwdev, dev_addr))
13954 +               return dev_addr;
13955 +
13956 +       /*
13957 +        * Oh well, have to allocate and map a bounce buffer.
13958 +        */
13959 +       buffer.page   = virt_to_page(ptr);
13960 +       buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
13961 +       map = map_single(hwdev, buffer, size, dir);
13962 +       if (!map) {
13963 +               swiotlb_full(hwdev, size, dir, 1);
13964 +               map = io_tlb_overflow_buffer;
13965 +       }
13966 +
13967 +       dev_addr = virt_to_bus(map);
13968 +       return dev_addr;
13969 +}
13970 +
13971 +/*
13972 + * Unmap a single streaming mode DMA translation.  The dma_addr and size must
13973 + * match what was provided for in a previous swiotlb_map_single call.  All
13974 + * other usages are undefined.
13975 + *
13976 + * After this call, reads by the cpu to the buffer are guaranteed to see
13977 + * whatever the device wrote there.
13978 + */
13979 +void
13980 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
13981 +                    int dir)
13982 +{
13983 +       BUG_ON(dir == DMA_NONE);
13984 +       if (in_swiotlb_aperture(dev_addr))
13985 +               unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
13986 +}
13987 +
13988 +/*
13989 + * Make physical memory consistent for a single streaming mode DMA translation
13990 + * after a transfer.
13991 + *
13992 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
13993 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
13994 + * call this function before doing so.  At the next point you give the PCI dma
13995 + * address back to the card, you must first perform a
13996 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer
13997 + */
13998 +void
13999 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
14000 +                           size_t size, int dir)
14001 +{
14002 +       BUG_ON(dir == DMA_NONE);
14003 +       if (in_swiotlb_aperture(dev_addr))
14004 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
14005 +}
14006 +
14007 +void
14008 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
14009 +                              size_t size, int dir)
14010 +{
14011 +       BUG_ON(dir == DMA_NONE);
14012 +       if (in_swiotlb_aperture(dev_addr))
14013 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
14014 +}
14015 +
14016 +/*
14017 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
14018 + * This is the scatter-gather version of the above swiotlb_map_single
14019 + * interface.  Here the scatter gather list elements are each tagged with the
14020 + * appropriate dma address and length.  They are obtained via
14021 + * sg_dma_{address,length}(SG).
14022 + *
14023 + * NOTE: An implementation may be able to use a smaller number of
14024 + *       DMA address/length pairs than there are SG table elements.
14025 + *       (for example via virtual mapping capabilities)
14026 + *       The routine returns the number of addr/length pairs actually
14027 + *       used, at most nents.
14028 + *
14029 + * Device ownership issues as mentioned above for swiotlb_map_single are the
14030 + * same here.
14031 + */
14032 +int
14033 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
14034 +              int dir)
14035 +{
14036 +       struct phys_addr buffer;
14037 +       dma_addr_t dev_addr;
14038 +       char *map;
14039 +       int i;
14040 +
14041 +       BUG_ON(dir == DMA_NONE);
14042 +
14043 +       for (i = 0; i < nelems; i++, sg++) {
14044 +               dev_addr = SG_ENT_PHYS_ADDRESS(sg);
14045 +               if (address_needs_mapping(hwdev, dev_addr)) {
14046 +                       buffer.page   = sg->page;
14047 +                       buffer.offset = sg->offset;
14048 +                       map = map_single(hwdev, buffer, sg->length, dir);
14049 +                       if (!map) {
14050 +                               /* Don't panic here, we expect map_sg users
14051 +                                  to do proper error handling. */
14052 +                               swiotlb_full(hwdev, sg->length, dir, 0);
14053 +                               swiotlb_unmap_sg(hwdev, sg - i, i, dir);
14054 +                               sg[0].dma_length = 0;
14055 +                               return 0;
14056 +                       }
14057 +                       sg->dma_address = (dma_addr_t)virt_to_bus(map);
14058 +               } else
14059 +                       sg->dma_address = dev_addr;
14060 +               sg->dma_length = sg->length;
14061 +       }
14062 +       return nelems;
14063 +}
14064 +
14065 +/*
14066 + * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
14067 + * concerning calls here are the same as for swiotlb_unmap_single() above.
14068 + */
14069 +void
14070 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
14071 +                int dir)
14072 +{
14073 +       int i;
14074 +
14075 +       BUG_ON(dir == DMA_NONE);
14076 +
14077 +       for (i = 0; i < nelems; i++, sg++)
14078 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14079 +                       unmap_single(hwdev, 
14080 +                                    (void *)bus_to_virt(sg->dma_address),
14081 +                                    sg->dma_length, dir);
14082 +}
14083 +
14084 +/*
14085 + * Make physical memory consistent for a set of streaming mode DMA translations
14086 + * after a transfer.
14087 + *
14088 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
14089 + * and usage.
14090 + */
14091 +void
14092 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
14093 +                       int nelems, int dir)
14094 +{
14095 +       int i;
14096 +
14097 +       BUG_ON(dir == DMA_NONE);
14098 +
14099 +       for (i = 0; i < nelems; i++, sg++)
14100 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14101 +                       sync_single(hwdev,
14102 +                                   (void *)bus_to_virt(sg->dma_address),
14103 +                                   sg->dma_length, dir);
14104 +}
14105 +
14106 +void
14107 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
14108 +                          int nelems, int dir)
14109 +{
14110 +       int i;
14111 +
14112 +       BUG_ON(dir == DMA_NONE);
14113 +
14114 +       for (i = 0; i < nelems; i++, sg++)
14115 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14116 +                       sync_single(hwdev,
14117 +                                   (void *)bus_to_virt(sg->dma_address),
14118 +                                   sg->dma_length, dir);
14119 +}
14120 +
14121 +dma_addr_t
14122 +swiotlb_map_page(struct device *hwdev, struct page *page,
14123 +                unsigned long offset, size_t size,
14124 +                enum dma_data_direction direction)
14125 +{
14126 +       struct phys_addr buffer;
14127 +       dma_addr_t dev_addr;
14128 +       char *map;
14129 +
14130 +       dev_addr = page_to_bus(page) + offset;
14131 +       if (address_needs_mapping(hwdev, dev_addr)) {
14132 +               buffer.page   = page;
14133 +               buffer.offset = offset;
14134 +               map = map_single(hwdev, buffer, size, direction);
14135 +               if (!map) {
14136 +                       swiotlb_full(hwdev, size, direction, 1);
14137 +                       map = io_tlb_overflow_buffer;
14138 +               }
14139 +               dev_addr = (dma_addr_t)virt_to_bus(map);
14140 +       }
14141 +
14142 +       return dev_addr;
14143 +}
14144 +
14145 +void
14146 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
14147 +                  size_t size, enum dma_data_direction direction)
14148 +{
14149 +       BUG_ON(!valid_dma_direction(direction));
14150 +       if (in_swiotlb_aperture(dma_address))
14151 +               unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
14152 +}
14153 +
14154 +int
14155 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
14156 +{
14157 +       return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
14158 +}
14159 +
14160 +/*
14161 + * Return whether the given PCI device DMA address mask can be supported
14162 + * properly.  For example, if your device can only drive the low 24-bits
14163 + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
14164 + * this function.
14165 + */
14166 +int
14167 +swiotlb_dma_supported (struct device *hwdev, u64 mask)
14168 +{
14169 +       return (mask >= ((1UL << dma_bits) - 1));
14170 +}
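
swiotlb_dma_supported() above only asks whether the device's DMA mask covers the highest bus address the bounce pool may occupy, i.e. (1 << dma_bits) - 1. A quick stand-alone check of that comparison with the default dma_bits of 30 (the sample masks are arbitrary):

#include <stdio.h>
#include <stdint.h>

static unsigned int dma_bits = 30;        /* DEFAULT_DMA_BITS */

static int dma_supported(uint64_t mask)
{
	return mask >= ((1ULL << dma_bits) - 1);
}

int main(void)
{
	printf("24-bit mask -> %d\n", dma_supported(0x00ffffffULL));   /* 0 */
	printf("30-bit mask -> %d\n", dma_supported(0x3fffffffULL));   /* 1 */
	printf("32-bit mask -> %d\n", dma_supported(0xffffffffULL));   /* 1 */
	return 0;
}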
14171 +
14172 +EXPORT_SYMBOL(swiotlb_init);
14173 +EXPORT_SYMBOL(swiotlb_map_single);
14174 +EXPORT_SYMBOL(swiotlb_unmap_single);
14175 +EXPORT_SYMBOL(swiotlb_map_sg);
14176 +EXPORT_SYMBOL(swiotlb_unmap_sg);
14177 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
14178 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
14179 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
14180 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
14181 +EXPORT_SYMBOL(swiotlb_map_page);
14182 +EXPORT_SYMBOL(swiotlb_unmap_page);
14183 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
14184 +EXPORT_SYMBOL(swiotlb_dma_supported);
14185 diff -ruNp linux-2.6.19/arch/i386/kernel/sysenter.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/sysenter.c
14186 --- linux-2.6.19/arch/i386/kernel/sysenter.c    2006-11-29 21:57:37.000000000 +0000
14187 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/sysenter.c  2007-02-02 19:10:21.000000000 +0000
14188 @@ -23,6 +23,10 @@
14189  #include <asm/pgtable.h>
14190  #include <asm/unistd.h>
14191  
14192 +#ifdef CONFIG_XEN
14193 +#include <xen/interface/callback.h>
14194 +#endif
14195 +
14196  /*
14197   * Should the kernel map a VDSO page into processes and pass its
14198   * address down to glibc upon exec()?
14199 @@ -44,6 +48,7 @@ extern asmlinkage void sysenter_entry(vo
14200  
14201  void enable_sep_cpu(void)
14202  {
14203 +#ifndef CONFIG_X86_NO_TSS
14204         int cpu = get_cpu();
14205         struct tss_struct *tss = &per_cpu(init_tss, cpu);
14206  
14207 @@ -58,6 +63,7 @@ void enable_sep_cpu(void)
14208         wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
14209         wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
14210         put_cpu();      
14211 +#endif
14212  }
14213  
14214  /*
14215 @@ -72,6 +78,18 @@ int __init sysenter_setup(void)
14216  {
14217         syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
14218  
14219 +#ifdef CONFIG_XEN
14220 +       if (boot_cpu_has(X86_FEATURE_SEP)) {
14221 +               static struct callback_register __initdata sysenter = {
14222 +                       .type = CALLBACKTYPE_sysenter,
14223 +                       .address = { __KERNEL_CS, (unsigned long)sysenter_entry },
14224 +               };
14225 +
14226 +               if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
14227 +                       clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
14228 +       }
14229 +#endif
14230 +
14231  #ifdef CONFIG_COMPAT_VDSO
14232         __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
14233         printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
14234 @@ -79,8 +97,12 @@ int __init sysenter_setup(void)
14235         /*
14236          * In the non-compat case the ELF coredumping code needs the fixmap:
14237          */
14238 +#ifdef CONFIG_XEN
14239 +       __set_fixmap(FIX_VDSO, virt_to_machine(syscall_page), PAGE_KERNEL_RO);
14240 +#else
14241         __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO);
14242  #endif
14243 +#endif
14244  
14245         if (!boot_cpu_has(X86_FEATURE_SEP)) {
14246                 memcpy(syscall_page,
14247 diff -ruNp linux-2.6.19/arch/i386/kernel/time-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/time-xen.c
14248 --- linux-2.6.19/arch/i386/kernel/time-xen.c    1970-01-01 00:00:00.000000000 +0000
14249 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/time-xen.c  2007-02-02 19:10:21.000000000 +0000
14250 @@ -0,0 +1,1121 @@
14251 +/*
14252 + *  linux/arch/i386/kernel/time.c
14253 + *
14254 + *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
14255 + *
14256 + * This file contains the PC-specific time handling details:
14257 + * reading the RTC at bootup, etc..
14258 + * 1994-07-02    Alan Modra
14259 + *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
14260 + * 1995-03-26    Markus Kuhn
14261 + *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
14262 + *      precision CMOS clock update
14263 + * 1996-05-03    Ingo Molnar
14264 + *      fixed time warps in do_[slow|fast]_gettimeoffset()
14265 + * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
14266 + *             "A Kernel Model for Precision Timekeeping" by Dave Mills
14267 + * 1998-09-05    (Various)
14268 + *     More robust do_fast_gettimeoffset() algorithm implemented
14269 + *     (works with APM, Cyrix 6x86MX and Centaur C6),
14270 + *     monotonic gettimeofday() with fast_get_timeoffset(),
14271 + *     drift-proof precision TSC calibration on boot
14272 + *     (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
14273 + *     Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
14274 + *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
14275 + * 1998-12-16    Andrea Arcangeli
14276 + *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
14277 + *     because was not accounting lost_ticks.
14278 + * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
14279 + *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
14280 + *     serialize accesses to xtime/lost_ticks).
14281 + */
14282 +
14283 +#include <linux/errno.h>
14284 +#include <linux/sched.h>
14285 +#include <linux/kernel.h>
14286 +#include <linux/param.h>
14287 +#include <linux/string.h>
14288 +#include <linux/mm.h>
14289 +#include <linux/interrupt.h>
14290 +#include <linux/time.h>
14291 +#include <linux/delay.h>
14292 +#include <linux/init.h>
14293 +#include <linux/smp.h>
14294 +#include <linux/module.h>
14295 +#include <linux/sysdev.h>
14296 +#include <linux/bcd.h>
14297 +#include <linux/efi.h>
14298 +#include <linux/mca.h>
14299 +#include <linux/sysctl.h>
14300 +#include <linux/percpu.h>
14301 +#include <linux/kernel_stat.h>
14302 +#include <linux/posix-timers.h>
14303 +
14304 +#include <asm/io.h>
14305 +#include <asm/smp.h>
14306 +#include <asm/irq.h>
14307 +#include <asm/msr.h>
14308 +#include <asm/delay.h>
14309 +#include <asm/mpspec.h>
14310 +#include <asm/uaccess.h>
14311 +#include <asm/processor.h>
14312 +#include <asm/timer.h>
14313 +#include <asm/sections.h>
14314 +
14315 +#include "mach_time.h"
14316 +
14317 +#include <linux/timex.h>
14318 +
14319 +#include <asm/hpet.h>
14320 +
14321 +#include <asm/arch_hooks.h>
14322 +
14323 +#include <xen/evtchn.h>
14324 +#include <xen/interface/vcpu.h>
14325 +
14326 +#if defined (__i386__)
14327 +#include <asm/i8259.h>
14328 +#endif
14329 +
14330 +int pit_latch_buggy;              /* extern */
14331 +
14332 +#if defined(__x86_64__)
14333 +unsigned long vxtime_hz = PIT_TICK_RATE;
14334 +struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
14335 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
14336 +struct timespec __xtime __section_xtime;
14337 +struct timezone __sys_tz __section_sys_tz;
14338 +#endif
14339 +
14340 +#define USEC_PER_TICK (USEC_PER_SEC / HZ)
14341 +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
14342 +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
14343 +
14344 +#define NS_SCALE       10 /* 2^10, carefully chosen */
14345 +#define US_SCALE       32 /* 2^32, arbitrarily chosen */
14346 +
14347 +unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
14348 +EXPORT_SYMBOL(cpu_khz);
14349 +
14350 +DEFINE_SPINLOCK(rtc_lock);
14351 +EXPORT_SYMBOL(rtc_lock);
14352 +
14353 +extern struct init_timer_opts timer_tsc_init;
14354 +extern struct timer_opts timer_tsc;
14355 +#define timer_none timer_tsc
14356 +
14357 +/* These are periodically updated in shared_info, and then copied here. */
14358 +struct shadow_time_info {
14359 +       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
14360 +       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
14361 +       u32 tsc_to_nsec_mul;
14362 +       u32 tsc_to_usec_mul;
14363 +       int tsc_shift;
14364 +       u32 version;
14365 +};
14366 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
14367 +static struct timespec shadow_tv;
14368 +static u32 shadow_tv_version;
14369 +
14370 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
14371 +static u64 processed_system_time;   /* System time (ns) at last processing. */
14372 +static DEFINE_PER_CPU(u64, processed_system_time);
14373 +
14374 +/* How much CPU time was spent blocked and how much was 'stolen'? */
14375 +static DEFINE_PER_CPU(u64, processed_stolen_time);
14376 +static DEFINE_PER_CPU(u64, processed_blocked_time);
14377 +
14378 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
14379 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
14380 +
14381 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
14382 +#define NS_PER_TICK (1000000000LL/HZ)
14383 +
14384 +static inline void __normalize_time(time_t *sec, s64 *nsec)
14385 +{
14386 +       while (*nsec >= NSEC_PER_SEC) {
14387 +               (*nsec) -= NSEC_PER_SEC;
14388 +               (*sec)++;
14389 +       }
14390 +       while (*nsec < 0) {
14391 +               (*nsec) += NSEC_PER_SEC;
14392 +               (*sec)--;
14393 +       }
14394 +}
14395 +
14396 +/* Does this guest OS track Xen time, or set its wall clock independently? */
14397 +static int independent_wallclock = 0;
14398 +static int __init __independent_wallclock(char *str)
14399 +{
14400 +       independent_wallclock = 1;
14401 +       return 1;
14402 +}
14403 +__setup("independent_wallclock", __independent_wallclock);
14404 +
14405 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
14406 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
14407 +static int __init __permitted_clock_jitter(char *str)
14408 +{
14409 +       permitted_clock_jitter = simple_strtoul(str, NULL, 0);
14410 +       return 1;
14411 +}
14412 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
14413 +
14414 +#ifndef CONFIG_X86
14415 +int tsc_disable __devinitdata = 0;
14416 +#endif
14417 +
14418 +static void delay_tsc(unsigned long loops)
14419 +{
14420 +       unsigned long bclock, now;
14421 +
14422 +       rdtscl(bclock);
14423 +       do {
14424 +               rep_nop();
14425 +               rdtscl(now);
14426 +       } while ((now - bclock) < loops);
14427 +}
14428 +
14429 +struct timer_opts timer_tsc = {
14430 +       .name = "tsc",
14431 +       .delay = delay_tsc,
14432 +};
14433 +
14434 +/*
14435 + * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
14436 + * yielding a 64-bit result.
14437 + */
14438 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
14439 +{
14440 +       u64 product;
14441 +#ifdef __i386__
14442 +       u32 tmp1, tmp2;
14443 +#endif
14444 +
14445 +       if (shift < 0)
14446 +               delta >>= -shift;
14447 +       else
14448 +               delta <<= shift;
14449 +
14450 +#ifdef __i386__
14451 +       __asm__ (
14452 +               "mul  %5       ; "
14453 +               "mov  %4,%%eax ; "
14454 +               "mov  %%edx,%4 ; "
14455 +               "mul  %5       ; "
14456 +               "xor  %5,%5    ; "
14457 +               "add  %4,%%eax ; "
14458 +               "adc  %5,%%edx ; "
14459 +               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
14460 +               : "a" ((u32)delta), "1" ((u32)(delta >> US_SCALE)), "2" (mul_frac) );
14461 +#else
14462 +       __asm__ (
14463 +               "mul %%rdx ; shrd $32,%%rdx,%%rax"
14464 +               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
14465 +#endif
14466 +
14467 +       return product;
14468 +}
14469 +
14470 +#if defined (__i386__)
14471 +int read_current_timer(unsigned long *timer_val)
14472 +{
14473 +       rdtscl(*timer_val);
14474 +       return 0;
14475 +}
14476 +#endif
14477 +
14478 +void init_cpu_khz(void)
14479 +{
14480 +       u64 __cpu_khz = 1000000ULL << US_SCALE;
14481 +       struct vcpu_time_info *info;
14482 +       info = &HYPERVISOR_shared_info->vcpu_info[0].time;
14483 +       do_div(__cpu_khz, info->tsc_to_system_mul);
14484 +       if (info->tsc_shift < 0)
14485 +               cpu_khz = __cpu_khz << -info->tsc_shift;
14486 +       else
14487 +               cpu_khz = __cpu_khz >> info->tsc_shift;
14488 +}
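
init_cpu_khz() above inverts Xen's time scaling: the hypervisor publishes tsc_to_system_mul such that nanoseconds = (tsc * mul) >> 32 (after tsc_shift), so the TSC rate in kHz falls out as (10^6 << 32) / mul, shifted back. A small round-trip check under the assumption of a 2.4 GHz TSC and tsc_shift == 0:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t tsc_hz = 2400000000ULL;                 /* assumed 2.4 GHz */
	/* What Xen would publish: 2^32 * 1e9 / tsc_hz, with tsc_shift == 0. */
	uint32_t mul = (uint32_t)((((unsigned __int128)1000000000) << 32) / tsc_hz);

	/* What init_cpu_khz() recovers from it. */
	uint64_t khz = (1000000ULL << 32) / mul;

	printf("tsc_to_system_mul = %u\n", mul);
	printf("cpu_khz           = %llu\n", (unsigned long long)khz);  /* ~2400000 */
	return 0;
}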
14489 +
14490 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
14491 +{
14492 +       u64 now, delta;
14493 +       rdtscll(now);
14494 +       delta = now - shadow->tsc_timestamp;
14495 +       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
14496 +}
14497 +
14498 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
14499 +{
14500 +       u64 now, delta;
14501 +       rdtscll(now);
14502 +       delta = now - shadow->tsc_timestamp;
14503 +       return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
14504 +}
14505 +
14506 +static void __update_wallclock(time_t sec, long nsec)
14507 +{
14508 +       long wtm_nsec, xtime_nsec;
14509 +       time_t wtm_sec, xtime_sec;
14510 +       u64 tmp, wc_nsec;
14511 +
14512 +       /* Adjust wall-clock time base based on jiffies ticks. */
14513 +       wc_nsec = processed_system_time;
14514 +       wc_nsec += sec * (u64)NSEC_PER_SEC;
14515 +       wc_nsec += nsec;
14516 +
14517 +       /* Split wallclock base into seconds and nanoseconds. */
14518 +       tmp = wc_nsec;
14519 +       xtime_nsec = do_div(tmp, 1000000000);
14520 +       xtime_sec  = (time_t)tmp;
14521 +
14522 +       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
14523 +       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
14524 +
14525 +       set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
14526 +       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
14527 +
14528 +       ntp_clear();
14529 +}
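
In __update_wallclock() above, do_div() divides the 64-bit nanosecond base in place, leaving the quotient (whole seconds) in its argument and returning the remainder. In plain C the same split is just division and modulus; a tiny worked example with an assumed value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t wc_nsec = 1234567890123ULL;        /* assumed wallclock base, in ns */
	uint64_t sec  = wc_nsec / 1000000000ULL;    /* do_div() quotient  */
	uint64_t nsec = wc_nsec % 1000000000ULL;    /* do_div() remainder */

	printf("%llu ns = %llu s + %llu ns\n",      /* 1234 s + 567890123 ns */
	       (unsigned long long)wc_nsec,
	       (unsigned long long)sec, (unsigned long long)nsec);
	return 0;
}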
14530 +
14531 +static void update_wallclock(void)
14532 +{
14533 +       shared_info_t *s = HYPERVISOR_shared_info;
14534 +
14535 +       do {
14536 +               shadow_tv_version = s->wc_version;
14537 +               rmb();
14538 +               shadow_tv.tv_sec  = s->wc_sec;
14539 +               shadow_tv.tv_nsec = s->wc_nsec;
14540 +               rmb();
14541 +       } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
14542 +
14543 +       if (!independent_wallclock)
14544 +               __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
14545 +}
14546 +
14547 +/*
14548 + * Reads a consistent set of time-base values from Xen, into a shadow data
14549 + * area.
14550 + */
14551 +static void get_time_values_from_xen(void)
14552 +{
14553 +       shared_info_t           *s = HYPERVISOR_shared_info;
14554 +       struct vcpu_time_info   *src;
14555 +       struct shadow_time_info *dst;
14556 +
14557 +       src = &s->vcpu_info[smp_processor_id()].time;
14558 +       dst = &per_cpu(shadow_time, smp_processor_id());
14559 +
14560 +       do {
14561 +               dst->version = src->version;
14562 +               rmb();
14563 +               dst->tsc_timestamp     = src->tsc_timestamp;
14564 +               dst->system_timestamp  = src->system_time;
14565 +               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
14566 +               dst->tsc_shift         = src->tsc_shift;
14567 +               rmb();
14568 +       } while ((src->version & 1) | (dst->version ^ src->version));
14569 +
14570 +       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
14571 +}
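
update_wallclock() and get_time_values_from_xen() above both use a lock-free publication protocol: the writer (Xen) makes the version field odd while it updates the record and even again when it finishes, and the reader retries until it reads the same even version before and after its copy. A user-space sketch of the same idea, assuming C11 atomics and a writer thread standing in for the hypervisor (all names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the vcpu_time_info fields the patch shadows per CPU. */
struct shared_time {
	atomic_uint  version;   /* odd while the writer is mid-update */
	atomic_ulong sec;
	atomic_ulong nsec;
};

static struct shared_time shared;

static void *writer(void *arg)
{
	for (unsigned long i = 1; i <= 100000; i++) {
		atomic_fetch_add(&shared.version, 1);   /* odd: update in progress */
		atomic_store(&shared.sec, i);
		atomic_store(&shared.nsec, i * 1000);
		atomic_fetch_add(&shared.version, 1);   /* even: update published */
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	unsigned long sec, nsec;
	unsigned int v;

	pthread_create(&tid, NULL, writer, NULL);

	/* Reader: retry while the snapshot straddled an update or was mid-update. */
	do {
		v    = atomic_load(&shared.version);
		sec  = atomic_load(&shared.sec);
		nsec = atomic_load(&shared.nsec);
	} while ((v & 1) || v != atomic_load(&shared.version));

	pthread_join(tid, NULL);
	printf("consistent snapshot: sec=%lu nsec=%lu\n", sec, nsec);
	return 0;
}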
14572 +
14573 +static inline int time_values_up_to_date(int cpu)
14574 +{
14575 +       struct vcpu_time_info   *src;
14576 +       struct shadow_time_info *dst;
14577 +
14578 +       src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
14579 +       dst = &per_cpu(shadow_time, cpu);
14580 +
14581 +       rmb();
14582 +       return (dst->version == src->version);
14583 +}
14584 +
14585 +/*
14586 + * This is a special lock that is owned by the CPU and holds the index
14587 + * register we are working with.  It is required for NMI access to the
14588 + * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
14589 + */
14590 +volatile unsigned long cmos_lock = 0;
14591 +EXPORT_SYMBOL(cmos_lock);
14592 +
14593 +/* Routines for accessing the CMOS RAM/RTC. */
14594 +unsigned char rtc_cmos_read(unsigned char addr)
14595 +{
14596 +       unsigned char val;
14597 +       lock_cmos_prefix(addr);
14598 +       outb_p(addr, RTC_PORT(0));
14599 +       val = inb_p(RTC_PORT(1));
14600 +       lock_cmos_suffix(addr);
14601 +       return val;
14602 +}
14603 +EXPORT_SYMBOL(rtc_cmos_read);
14604 +
14605 +void rtc_cmos_write(unsigned char val, unsigned char addr)
14606 +{
14607 +       lock_cmos_prefix(addr);
14608 +       outb_p(addr, RTC_PORT(0));
14609 +       outb_p(val, RTC_PORT(1));
14610 +       lock_cmos_suffix(addr);
14611 +}
14612 +EXPORT_SYMBOL(rtc_cmos_write);
14613 +
14614 +/*
14615 + * This version of gettimeofday has microsecond resolution
14616 + * and better than microsecond precision on fast x86 machines with TSC.
14617 + */
14618 +void do_gettimeofday(struct timeval *tv)
14619 +{
14620 +       unsigned long seq;
14621 +       unsigned long usec, sec;
14622 +       unsigned long max_ntp_tick;
14623 +       s64 nsec;
14624 +       unsigned int cpu;
14625 +       struct shadow_time_info *shadow;
14626 +       u32 local_time_version;
14627 +
14628 +       cpu = get_cpu();
14629 +       shadow = &per_cpu(shadow_time, cpu);
14630 +
14631 +       do {
14632 +               local_time_version = shadow->version;
14633 +               seq = read_seqbegin(&xtime_lock);
14634 +
14635 +               usec = get_usec_offset(shadow);
14636 +
14637 +               /*
14638 +                * If time_adjust is negative then NTP is slowing the clock
14639 +                * so make sure not to go into next possible interval.
14640 +                * Better to lose some accuracy than have time go backwards.
14641 +                */
14642 +               if (unlikely(time_adjust < 0)) {
14643 +                       max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
14644 +                       usec = min(usec, max_ntp_tick);
14645 +               }
14646 +
14647 +               sec = xtime.tv_sec;
14648 +               usec += (xtime.tv_nsec / NSEC_PER_USEC);
14649 +
14650 +               nsec = shadow->system_timestamp - processed_system_time;
14651 +               __normalize_time(&sec, &nsec);
14652 +               usec += (long)nsec / NSEC_PER_USEC;
14653 +
14654 +               if (unlikely(!time_values_up_to_date(cpu))) {
14655 +                       /*
14656 +                        * We may have blocked for a long time,
14657 +                        * rendering our calculations invalid
14658 +                        * (e.g. the time delta may have
14659 +                        * overflowed). Detect that and recalculate
14660 +                        * with fresh values.
14661 +                        */
14662 +                       get_time_values_from_xen();
14663 +                       continue;
14664 +               }
14665 +       } while (read_seqretry(&xtime_lock, seq) ||
14666 +                (local_time_version != shadow->version));
14667 +
14668 +       put_cpu();
14669 +
14670 +       while (usec >= USEC_PER_SEC) {
14671 +               usec -= USEC_PER_SEC;
14672 +               sec++;
14673 +       }
14674 +
14675 +       tv->tv_sec = sec;
14676 +       tv->tv_usec = usec;
14677 +}
14678 +
14679 +EXPORT_SYMBOL(do_gettimeofday);
14680 +
14681 +int do_settimeofday(struct timespec *tv)
14682 +{
14683 +       time_t sec;
14684 +       s64 nsec;
14685 +       unsigned int cpu;
14686 +       struct shadow_time_info *shadow;
14687 +       dom0_op_t op;
14688 +
14689 +       if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
14690 +               return -EINVAL;
14691 +
14692 +       cpu = get_cpu();
14693 +       shadow = &per_cpu(shadow_time, cpu);
14694 +
14695 +       write_seqlock_irq(&xtime_lock);
14696 +
14697 +       /*
14698 +        * Make sure we haven't been blocked for so long that our time delta
14699 +        * overflows. If that were to happen our shadow time values would be
14700 +        * stale, so retry with fresh ones.
14701 +        */
14702 +       for (;;) {
14703 +               nsec = tv->tv_nsec - get_nsec_offset(shadow);
14704 +               if (time_values_up_to_date(cpu))
14705 +                       break;
14706 +               get_time_values_from_xen();
14707 +       }
14708 +       sec = tv->tv_sec;
14709 +       __normalize_time(&sec, &nsec);
14710 +
14711 +       if (is_initial_xendomain() && !independent_wallclock) {
14712 +               op.cmd = DOM0_SETTIME;
14713 +               op.u.settime.secs        = sec;
14714 +               op.u.settime.nsecs       = nsec;
14715 +               op.u.settime.system_time = shadow->system_timestamp;
14716 +               HYPERVISOR_dom0_op(&op);
14717 +               update_wallclock();
14718 +       } else if (independent_wallclock) {
14719 +               nsec -= shadow->system_timestamp;
14720 +               __normalize_time(&sec, &nsec);
14721 +               __update_wallclock(sec, nsec);
14722 +       }
14723 +
14724 +       write_sequnlock_irq(&xtime_lock);
14725 +
14726 +       put_cpu();
14727 +
14728 +       clock_was_set();
14729 +       return 0;
14730 +}
14731 +
14732 +EXPORT_SYMBOL(do_settimeofday);
14733 +
14734 +static void sync_xen_wallclock(unsigned long dummy);
14735 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
14736 +static void sync_xen_wallclock(unsigned long dummy)
14737 +{
14738 +       time_t sec;
14739 +       s64 nsec;
14740 +       dom0_op_t op;
14741 +
14742 +       if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
14743 +               return;
14744 +
14745 +       write_seqlock_irq(&xtime_lock);
14746 +
14747 +       sec  = xtime.tv_sec;
14748 +       nsec = xtime.tv_nsec;
14749 +       __normalize_time(&sec, &nsec);
14750 +
14751 +       op.cmd = DOM0_SETTIME;
14752 +       op.u.settime.secs        = sec;
14753 +       op.u.settime.nsecs       = nsec;
14754 +       op.u.settime.system_time = processed_system_time;
14755 +       HYPERVISOR_dom0_op(&op);
14756 +
14757 +       update_wallclock();
14758 +
14759 +       write_sequnlock_irq(&xtime_lock);
14760 +
14761 +       /* Once per minute. */
14762 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
14763 +}
14764 +
14765 +static int set_rtc_mmss(unsigned long nowtime)
14766 +{
14767 +       int retval;
14768 +       unsigned long flags;
14769 +
14770 +       if (independent_wallclock || !is_initial_xendomain())
14771 +               return 0;
14772 +
14773 +       /* gets recalled with irq locally disabled */
14774 +       spin_lock_irqsave(&rtc_lock, flags);
14775 +       if (efi_enabled)
14776 +               retval = efi_set_rtc_mmss(nowtime);
14777 +       else
14778 +               retval = mach_set_rtc_mmss(nowtime);
14779 +       spin_unlock_irqrestore(&rtc_lock, flags);
14780 +
14781 +       return retval;
14782 +}
14783 +
14784 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
14785 + *             Note: This function is required to return accurate
14786 + *             time even in the absence of multiple timer ticks.
14787 + */
14788 +unsigned long long monotonic_clock(void)
14789 +{
14790 +       int cpu = get_cpu();
14791 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14792 +       u64 time;
14793 +       u32 local_time_version;
14794 +
14795 +       do {
14796 +               local_time_version = shadow->version;
14797 +               barrier();
14798 +               time = shadow->system_timestamp + get_nsec_offset(shadow);
14799 +               if (!time_values_up_to_date(cpu))
14800 +                       get_time_values_from_xen();
14801 +               barrier();
14802 +       } while (local_time_version != shadow->version);
14803 +
14804 +       put_cpu();
14805 +
14806 +       return time;
14807 +}
14808 +EXPORT_SYMBOL(monotonic_clock);
14809 +
14810 +unsigned long long sched_clock(void)
14811 +{
14812 +       return monotonic_clock();
14813 +}
14814 +
14815 +unsigned long profile_pc(struct pt_regs *regs)
14816 +{
14817 +       unsigned long pc = instruction_pointer(regs);
14818 +
14819 +#if defined(__x86_64__)
14820 +       /* Assume the lock function has either no stack frame or a copy
14821 +          of eflags from PUSHF
14822 +          Eflags always has bits 22 and up cleared unlike kernel addresses. */
14823 +       if (!user_mode_vm(regs) && in_lock_functions(pc)) {
14824 +               unsigned long *sp = (unsigned long *)regs->rsp;
14825 +               if (sp[0] >> 22)
14826 +                       return sp[0];
14827 +               if (sp[1] >> 22)
14828 +                       return sp[1];
14829 +       }
14830 +#else
14831 +#ifdef CONFIG_SMP
14832 +       if (!user_mode_vm(regs) && in_lock_functions(pc)) {
14833 +#ifdef CONFIG_FRAME_POINTER
14834 +               return *(unsigned long *)(regs->ebp + 4);
14835 +#else
14836 +                unsigned long *sp;
14837 +                if ((regs->xcs & 3) == 0)
14838 +                        sp = (unsigned long *)&regs->esp;
14839 +                else
14840 +                        sp = (unsigned long *)regs->esp;
14841 +                /* Return address is either directly at stack pointer
14842 +                   or above a saved eflags. Eflags has bits 22-31 zero,
14843 +                   kernel addresses don't. */
14844 +                if (sp[0] >> 22)
14845 +                        return sp[0];
14846 +                if (sp[1] >> 22)
14847 +                        return sp[1];
14848 +#endif /* CONFIG_FRAME_POINTER */
14849 +       }
14850 +#endif /* CONFIG_SMP */
14851 +#endif /* __x86_64__ */
14852 +       return pc;
14853 +}
14854 +EXPORT_SYMBOL(profile_pc);
14855 +
14856 +irqreturn_t timer_interrupt(int irq, void *dev_id)
14857 +{
14858 +       s64 delta, delta_cpu, stolen, blocked;
14859 +       u64 sched_time;
14860 +       int i, cpu = smp_processor_id();
14861 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14862 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14863 +
14864 +       write_seqlock(&xtime_lock);
14865 +
14866 +       do {
14867 +               get_time_values_from_xen();
14868 +
14869 +               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
14870 +               delta = delta_cpu =
14871 +                       shadow->system_timestamp + get_nsec_offset(shadow);
14872 +               delta     -= processed_system_time;
14873 +               delta_cpu -= per_cpu(processed_system_time, cpu);
14874 +
14875 +               /*
14876 +                * Obtain a consistent snapshot of stolen/blocked cycles. We
14877 +                * can use state_entry_time to detect if we get preempted here.
14878 +                */
14879 +               do {
14880 +                       sched_time = runstate->state_entry_time;
14881 +                       barrier();
14882 +                       stolen = runstate->time[RUNSTATE_runnable] +
14883 +                               runstate->time[RUNSTATE_offline] -
14884 +                               per_cpu(processed_stolen_time, cpu);
14885 +                       blocked = runstate->time[RUNSTATE_blocked] -
14886 +                               per_cpu(processed_blocked_time, cpu);
14887 +                       barrier();
14888 +               } while (sched_time != runstate->state_entry_time);
14889 +       } while (!time_values_up_to_date(cpu));
14890 +
14891 +       if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
14892 +            unlikely(delta_cpu < -(s64)permitted_clock_jitter))
14893 +           && printk_ratelimit()) {
14894 +               printk("Timer ISR/%d: Time went backwards: "
14895 +                      "delta=%lld delta_cpu=%lld shadow=%lld "
14896 +                      "off=%lld processed=%lld cpu_processed=%lld\n",
14897 +                      cpu, delta, delta_cpu, shadow->system_timestamp,
14898 +                      (s64)get_nsec_offset(shadow),
14899 +                      processed_system_time,
14900 +                      per_cpu(processed_system_time, cpu));
14901 +               for (i = 0; i < num_online_cpus(); i++)
14902 +                       printk(" %d: %lld\n", i,
14903 +                              per_cpu(processed_system_time, i));
14904 +       }
14905 +
14906 +       /* System-wide jiffy work. */
14907 +       while (delta >= NS_PER_TICK) {
14908 +               delta -= NS_PER_TICK;
14909 +               processed_system_time += NS_PER_TICK;
14910 +               do_timer(1);
14911 +       }
14912 +
14913 +       if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
14914 +               update_wallclock();
14915 +               clock_was_set();
14916 +       }
14917 +
14918 +       write_sequnlock(&xtime_lock);
14919 +
14920 +       /*
14921 +        * Account stolen ticks.
14922 +        * HACK: Passing NULL to account_steal_time()
14923 +        * ensures that the ticks are accounted as stolen.
14924 +        */
14925 +       if ((stolen > 0) && (delta_cpu > 0)) {
14926 +               delta_cpu -= stolen;
14927 +               if (unlikely(delta_cpu < 0))
14928 +                       stolen += delta_cpu; /* clamp local-time progress */
14929 +               do_div(stolen, NS_PER_TICK);
14930 +               per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
14931 +               per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
14932 +               account_steal_time(NULL, (cputime_t)stolen);
14933 +       }
14934 +
14935 +       /*
14936 +        * Account blocked ticks.
14937 +        * HACK: Passing idle_task to account_steal_time()
14938 +        * ensures that the ticks are accounted as idle/wait.
14939 +        */
14940 +       if ((blocked > 0) && (delta_cpu > 0)) {
14941 +               delta_cpu -= blocked;
14942 +               if (unlikely(delta_cpu < 0))
14943 +                       blocked += delta_cpu; /* clamp local-time progress */
14944 +               do_div(blocked, NS_PER_TICK);
14945 +               per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
14946 +               per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
14947 +               account_steal_time(idle_task(cpu), (cputime_t)blocked);
14948 +       }
14949 +
14950 +       /* Account user/system ticks. */
14951 +       if (delta_cpu > 0) {
14952 +               do_div(delta_cpu, NS_PER_TICK);
14953 +               per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
14954 +               if (user_mode(get_irq_regs()))
14955 +                       account_user_time(current, (cputime_t)delta_cpu);
14956 +               else
14957 +                       account_system_time(current, HARDIRQ_OFFSET,
14958 +                                           (cputime_t)delta_cpu);
14959 +       }
14960 +
14961 +       /* Offlined for more than a few seconds? Avoid lockup warnings. */
14962 +       if (stolen > 5*HZ)
14963 +               touch_softlockup_watchdog();
14964 +
14965 +       /* Local timer processing (see update_process_times()). */
14966 +       run_local_timers();
14967 +       if (rcu_pending(cpu))
14968 +               rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
14969 +       scheduler_tick();
14970 +       run_posix_cpu_timers(current);
14971 +//     JQ: Why this works on 2.6.16 & 2.6.18 and generates a page
14972 +//     fault on 2.6.19 is a mystery to me.
14973 +//     profile_tick(CPU_PROFILING);
14974 +
14975 +       return IRQ_HANDLED;
14976 +}
14977 +
14978 +static void init_missing_ticks_accounting(int cpu)
14979 +{
14980 +       struct vcpu_register_runstate_memory_area area;
14981 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14982 +
14983 +       memset(runstate, 0, sizeof(*runstate));
14984 +
14985 +       area.addr.v = runstate;
14986 +       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
14987 +
14988 +       per_cpu(processed_blocked_time, cpu) =
14989 +               runstate->time[RUNSTATE_blocked];
14990 +       per_cpu(processed_stolen_time, cpu) =
14991 +               runstate->time[RUNSTATE_runnable] +
14992 +               runstate->time[RUNSTATE_offline];
14993 +}
14994 +
14995 +/* not static: needed by APM */
14996 +unsigned long get_cmos_time(void)
14997 +{
14998 +       unsigned long retval;
14999 +       unsigned long flags;
15000 +
15001 +       spin_lock_irqsave(&rtc_lock, flags);
15002 +
15003 +       if (efi_enabled)
15004 +               retval = efi_get_time();
15005 +       else
15006 +               retval = mach_get_cmos_time();
15007 +
15008 +       spin_unlock_irqrestore(&rtc_lock, flags);
15009 +
15010 +       return retval;
15011 +}
15012 +EXPORT_SYMBOL(get_cmos_time);
15013 +
15014 +static void sync_cmos_clock(unsigned long dummy);
15015 +
15016 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
15017 +
15018 +static void sync_cmos_clock(unsigned long dummy)
15019 +{
15020 +       struct timeval now, next;
15021 +       int fail = 1;
15022 +
15023 +       /*
15024 +        * If we have an externally synchronized Linux clock, then update
15025 +        * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
15026 +        * called as close as possible to 500 ms before the new second starts.
15027 +        * This code is run on a timer.  If the clock is set, that timer
15028 +        * may not expire at the correct time.  Thus, we adjust...
15029 +        */
15030 +       if (!ntp_synced())
15031 +               /*
15032 +                * Not synced, exit, do not restart a timer (if one is
15033 +                * running, let it run out).
15034 +                */
15035 +               return;
15036 +
15037 +       do_gettimeofday(&now);
15038 +       if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
15039 +           now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
15040 +               fail = set_rtc_mmss(now.tv_sec);
15041 +
15042 +       next.tv_usec = USEC_AFTER - now.tv_usec;
15043 +       if (next.tv_usec <= 0)
15044 +               next.tv_usec += USEC_PER_SEC;
15045 +
15046 +       if (!fail)
15047 +               next.tv_sec = 659;
15048 +       else
15049 +               next.tv_sec = 0;
15050 +
15051 +       if (next.tv_usec >= USEC_PER_SEC) {
15052 +               next.tv_sec++;
15053 +               next.tv_usec -= USEC_PER_SEC;
15054 +       }
15055 +       mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
15056 +}
15057 +
15058 +void notify_arch_cmos_timer(void)
15059 +{
15060 +       mod_timer(&sync_cmos_timer, jiffies + 1);
15061 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
15062 +}
15063 +
15064 +static long clock_cmos_diff;
15065 +static unsigned long sleep_start;
15066 +
15067 +static int timer_suspend(struct sys_device *dev, pm_message_t state)
15068 +{
15069 +       /*
15070 +        * Estimate time zone so that set_time can update the clock
15071 +        */
15072 +       unsigned long ctime =  get_cmos_time();
15073 +
15074 +       clock_cmos_diff = -ctime;
15075 +       clock_cmos_diff += get_seconds();
15076 +       sleep_start = ctime;
15077 +       return 0;
15078 +}
15079 +
15080 +static int timer_resume(struct sys_device *dev)
15081 +{
15082 +       unsigned long flags;
15083 +       unsigned long sec;
15084 +       unsigned long ctime = get_cmos_time();
15085 +       long sleep_length = (ctime - sleep_start) * HZ;
15086 +
15087 +       if (sleep_length < 0) {
15088 +               printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
15089 +               /* The time after the resume must not be earlier than the time
15090 +                * before the suspend or some nasty things will happen
15091 +                */
15092 +               sleep_length = 0;
15093 +               ctime = sleep_start;
15094 +       }
15095 +
15096 +#ifdef CONFIG_HPET_TIMER
15097 +       if (is_hpet_enabled())
15098 +               hpet_reenable();
15099 +#endif
15100 +       sec = ctime + clock_cmos_diff;
15101 +       write_seqlock_irqsave(&xtime_lock, flags);
15102 +       xtime.tv_sec = sec;
15103 +       xtime.tv_nsec = 0;
15104 +       jiffies_64 += sleep_length;
15105 +       write_sequnlock_irqrestore(&xtime_lock, flags);
15106 +       touch_softlockup_watchdog();
15107 +       return 0;
15108 +}
15109 +
15110 +static struct sysdev_class timer_sysclass = {
15111 +       .resume = timer_resume,
15112 +       .suspend = timer_suspend,
15113 +       set_kset_name("timer"),
15114 +};
15115 +
15116 +
15117 +/* XXX this driverfs stuff should probably go elsewhere later -john */
15118 +static struct sys_device device_timer = {
15119 +       .id     = 0,
15120 +       .cls    = &timer_sysclass,
15121 +};
15122 +
15123 +static int time_init_device(void)
15124 +{
15125 +       int error = sysdev_class_register(&timer_sysclass);
15126 +       if (!error)
15127 +               error = sysdev_register(&device_timer);
15128 +       return error;
15129 +}
15130 +
15131 +device_initcall(time_init_device);
15132 +
15133 +#ifdef CONFIG_HPET_TIMER
15134 +extern void (*late_time_init)(void);
15135 +/* Duplicate of time_init() below, with hpet_enable part added */
15136 +static void __init hpet_time_init(void)
15137 +{
15138 +       xtime.tv_sec = get_cmos_time();
15139 +       xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
15140 +       set_normalized_timespec(&wall_to_monotonic,
15141 +               -xtime.tv_sec, -xtime.tv_nsec);
15142 +
15143 +       if ((hpet_enable() >= 0) && hpet_use_timer) {
15144 +               printk("Using HPET for base-timer\n");
15145 +       }
15146 +
15147 +       time_init_hook();
15148 +}
15149 +#endif
15150 +
15151 +/* Dynamically-mapped IRQ. */
15152 +DEFINE_PER_CPU(int, timer_irq);
15153 +
15154 +extern void (*late_time_init)(void);
15155 +static void setup_cpu0_timer_irq(void)
15156 +{
15157 +       per_cpu(timer_irq, 0) =
15158 +               bind_virq_to_irqhandler(
15159 +                       VIRQ_TIMER,
15160 +                       0,
15161 +                       timer_interrupt,
15162 +                       SA_INTERRUPT,
15163 +                       "timer0",
15164 +                       NULL);
15165 +       BUG_ON(per_cpu(timer_irq, 0) < 0);
15166 +}
15167 +
15168 +void __init time_init(void)
15169 +{
15170 +#ifdef CONFIG_HPET_TIMER
15171 +       if (is_hpet_capable()) {
15172 +               /*
15173 +                * HPET initialization needs to do memory-mapped io. So, let
15174 +                * us do a late initialization after mem_init().
15175 +                */
15176 +               late_time_init = hpet_time_init;
15177 +               return;
15178 +       }
15179 +#endif
15180 +       get_time_values_from_xen();
15181 +
15182 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
15183 +       per_cpu(processed_system_time, 0) = processed_system_time;
15184 +       init_missing_ticks_accounting(0);
15185 +
15186 +       update_wallclock();
15187 +
15188 +       init_cpu_khz();
15189 +       printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
15190 +              cpu_khz / 1000, cpu_khz % 1000);
15191 +
15192 +#if defined(__x86_64__)
15193 +       vxtime.mode = VXTIME_TSC;
15194 +       vxtime.quot = (1000000L << US_SCALE) / vxtime_hz;
15195 +       vxtime.tsc_quot = (1000L << US_SCALE) / cpu_khz;
15196 +       sync_core();
15197 +       rdtscll(vxtime.last_tsc);
15198 +#endif
15199 +
15200 +       /* Cannot request_irq() until kmem is initialised. */
15201 +       late_time_init = setup_cpu0_timer_irq;
15202 +}
15203 +
15204 +/* Convert jiffies to system time. */
15205 +u64 jiffies_to_st(unsigned long j)
15206 +{
15207 +       unsigned long seq;
15208 +       long delta;
15209 +       u64 st;
15210 +
15211 +       do {
15212 +               seq = read_seqbegin(&xtime_lock);
15213 +               delta = j - jiffies;
15214 +               if (delta < 1) {
15215 +                       /* Triggers in some wrap-around cases, but that's okay:
15216 +                        * we just end up with a shorter timeout. */
15217 +                       st = processed_system_time + NS_PER_TICK;
15218 +               } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
15219 +                       /* Very long timeout means there is no pending timer.
15220 +                        * We indicate this to Xen by passing zero timeout. */
15221 +                       st = 0;
15222 +               } else {
15223 +                       st = processed_system_time + delta * (u64)NS_PER_TICK;
15224 +               }
15225 +       } while (read_seqretry(&xtime_lock, seq));
15226 +
15227 +       return st;
15228 +}
15229 +EXPORT_SYMBOL(jiffies_to_st);
15230 +
15231 +/*
15232 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
15233 + * These functions are based on implementations from arch/s390/kernel/time.c
15234 + */
15235 +static void stop_hz_timer(void)
15236 +{
15237 +       unsigned int cpu = smp_processor_id();
15238 +       unsigned long j;
15239 +
15240 +       cpu_set(cpu, nohz_cpu_mask);
15241 +
15242 +       /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
15243 +       /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
15244 +       /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
15245 +       /* stop the hz timer then the cpumasks created for subsequent values */
15246 +       /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
15247 +       /* nohz_cpu_mask and so will not depend on this cpu.                 */
15248 +
15249 +       smp_mb();
15250 +
15251 +       /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
15252 +       if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
15253 +           (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
15254 +               cpu_clear(cpu, nohz_cpu_mask);
15255 +               j = jiffies + 1;
15256 +       }
15257 +
15258 +       if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
15259 +               BUG();
15260 +}
15261 +
15262 +static void start_hz_timer(void)
15263 +{
15264 +       cpu_clear(smp_processor_id(), nohz_cpu_mask);
15265 +}
15266 +
15267 +void raw_safe_halt(void)
15268 +{
15269 +       stop_hz_timer();
15270 +       /* Blocking includes an implicit local_irq_enable(). */
15271 +       HYPERVISOR_block();
15272 +       start_hz_timer();
15273 +}
15274 +EXPORT_SYMBOL(raw_safe_halt);
15275 +
15276 +void halt(void)
15277 +{
15278 +       if (irqs_disabled())
15279 +               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
15280 +}
15281 +EXPORT_SYMBOL(halt);
15282 +
15283 +/* No locking required. We are only CPU running, and interrupts are off. */
15284 +void time_resume(void)
15285 +{
15286 +       init_cpu_khz();
15287 +
15288 +       get_time_values_from_xen();
15289 +
15290 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
15291 +       per_cpu(processed_system_time, 0) = processed_system_time;
15292 +       init_missing_ticks_accounting(0);
15293 +
15294 +       update_wallclock();
15295 +}
15296 +
15297 +#ifdef CONFIG_SMP
15298 +static char timer_name[NR_CPUS][15];
15299 +
15300 +int local_setup_timer(unsigned int cpu)
15301 +{
15302 +       int seq, irq;
15303 +
15304 +       BUG_ON(cpu == 0);
15305 +
15306 +       do {
15307 +               seq = read_seqbegin(&xtime_lock);
15308 +               /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
15309 +               per_cpu(processed_system_time, cpu) =
15310 +                       per_cpu(shadow_time, 0).system_timestamp;
15311 +               init_missing_ticks_accounting(cpu);
15312 +       } while (read_seqretry(&xtime_lock, seq));
15313 +
15314 +       sprintf(timer_name[cpu], "timer%d", cpu);
15315 +       irq = bind_virq_to_irqhandler(VIRQ_TIMER,
15316 +                                     cpu,
15317 +                                     timer_interrupt,
15318 +                                     SA_INTERRUPT,
15319 +                                     timer_name[cpu],
15320 +                                     NULL);
15321 +       if (irq < 0)
15322 +               return irq;
15323 +       per_cpu(timer_irq, cpu) = irq;
15324 +
15325 +       return 0;
15326 +}
15327 +
15328 +void local_teardown_timer(unsigned int cpu)
15329 +{
15330 +       BUG_ON(cpu == 0);
15331 +       unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
15332 +}
15333 +#endif
15334 +
15335 +/*
15336 + * /proc/sys/xen: This really belongs in another file. It can stay here for
15337 + * now however.
15338 + */
15339 +static ctl_table xen_subtable[] = {
15340 +       {
15341 +               .ctl_name       = 1,
15342 +               .procname       = "independent_wallclock",
15343 +               .data           = &independent_wallclock,
15344 +               .maxlen         = sizeof(independent_wallclock),
15345 +               .mode           = 0644,
15346 +               .proc_handler   = proc_dointvec
15347 +       },
15348 +       {
15349 +               .ctl_name       = 2,
15350 +               .procname       = "permitted_clock_jitter",
15351 +               .data           = &permitted_clock_jitter,
15352 +               .maxlen         = sizeof(permitted_clock_jitter),
15353 +               .mode           = 0644,
15354 +               .proc_handler   = proc_doulongvec_minmax
15355 +       },
15356 +       { 0 }
15357 +};
15358 +static ctl_table xen_table[] = {
15359 +       {
15360 +               .ctl_name       = 123,
15361 +               .procname       = "xen",
15362 +               .mode           = 0555,
15363 +               .child          = xen_subtable},
15364 +       { 0 }
15365 +};
15366 +static int __init xen_sysctl_init(void)
15367 +{
15368 +       (void)register_sysctl_table(xen_table, 0);
15369 +       return 0;
15370 +}
15371 +__initcall(xen_sysctl_init);
15372 diff -ruNp linux-2.6.19/arch/i386/kernel/traps-xen.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/traps-xen.c
15373 --- linux-2.6.19/arch/i386/kernel/traps-xen.c   1970-01-01 00:00:00.000000000 +0000
15374 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/traps-xen.c 2007-02-02 19:10:21.000000000 +0000
15375 @@ -0,0 +1,1248 @@
15376 +/*
15377 + *  linux/arch/i386/traps.c
15378 + *
15379 + *  Copyright (C) 1991, 1992  Linus Torvalds
15380 + *
15381 + *  Pentium III FXSR, SSE support
15382 + *     Gareth Hughes <gareth@valinux.com>, May 2000
15383 + */
15384 +
15385 +/*
15386 + * 'Traps.c' handles hardware traps and faults after we have saved some
15387 + * state in 'asm.s'.
15388 + */
15389 +#include <linux/sched.h>
15390 +#include <linux/kernel.h>
15391 +#include <linux/string.h>
15392 +#include <linux/errno.h>
15393 +#include <linux/timer.h>
15394 +#include <linux/mm.h>
15395 +#include <linux/init.h>
15396 +#include <linux/delay.h>
15397 +#include <linux/spinlock.h>
15398 +#include <linux/interrupt.h>
15399 +#include <linux/highmem.h>
15400 +#include <linux/kallsyms.h>
15401 +#include <linux/ptrace.h>
15402 +#include <linux/utsname.h>
15403 +#include <linux/kprobes.h>
15404 +#include <linux/kexec.h>
15405 +#include <linux/unwind.h>
15406 +#include <linux/uaccess.h>
15407 +
15408 +#ifdef CONFIG_EISA
15409 +#include <linux/ioport.h>
15410 +#include <linux/eisa.h>
15411 +#endif
15412 +
15413 +#ifdef CONFIG_MCA
15414 +#include <linux/mca.h>
15415 +#endif
15416 +
15417 +#include <asm/processor.h>
15418 +#include <asm/system.h>
15419 +#include <asm/io.h>
15420 +#include <asm/atomic.h>
15421 +#include <asm/debugreg.h>
15422 +#include <asm/desc.h>
15423 +#include <asm/i387.h>
15424 +#include <asm/nmi.h>
15425 +#include <asm/unwind.h>
15426 +#include <asm/smp.h>
15427 +#include <asm/arch_hooks.h>
15428 +#include <asm/kdebug.h>
15429 +#include <asm/stacktrace.h>
15430 +
15431 +#include <linux/module.h>
15432 +
15433 +#include "mach_traps.h"
15434 +
15435 +int panic_on_unrecovered_nmi;
15436 +
15437 +asmlinkage int system_call(void);
15438 +
15439 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
15440 +               { 0, 0 }, { 0, 0 } };
15441 +
15442 +/* Do we ignore FPU interrupts ? */
15443 +char ignore_fpu_irq = 0;
15444 +
15445 +#ifndef CONFIG_X86_NO_IDT
15446 +/*
15447 + * The IDT has to be page-aligned to simplify the Pentium
15448 + * F0 0F bug workaround.. We have a special link segment
15449 + * for this.
15450 + */
15451 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
15452 +#endif
15453 +
15454 +asmlinkage void divide_error(void);
15455 +asmlinkage void debug(void);
15456 +asmlinkage void nmi(void);
15457 +asmlinkage void int3(void);
15458 +asmlinkage void overflow(void);
15459 +asmlinkage void bounds(void);
15460 +asmlinkage void invalid_op(void);
15461 +asmlinkage void device_not_available(void);
15462 +asmlinkage void coprocessor_segment_overrun(void);
15463 +asmlinkage void invalid_TSS(void);
15464 +asmlinkage void segment_not_present(void);
15465 +asmlinkage void stack_segment(void);
15466 +asmlinkage void general_protection(void);
15467 +asmlinkage void page_fault(void);
15468 +asmlinkage void coprocessor_error(void);
15469 +asmlinkage void simd_coprocessor_error(void);
15470 +asmlinkage void alignment_check(void);
15471 +#ifndef CONFIG_XEN
15472 +asmlinkage void spurious_interrupt_bug(void);
15473 +#else
15474 +asmlinkage void fixup_4gb_segment(void);
15475 +#endif
15476 +asmlinkage void machine_check(void);
15477 +
15478 +static int kstack_depth_to_print = 24;
15479 +#ifdef CONFIG_STACK_UNWIND
15480 +static int call_trace = 1;
15481 +#else
15482 +#define call_trace (-1)
15483 +#endif
15484 +ATOMIC_NOTIFIER_HEAD(i386die_chain);
15485 +
15486 +int register_die_notifier(struct notifier_block *nb)
15487 +{
15488 +       vmalloc_sync_all();
15489 +       return atomic_notifier_chain_register(&i386die_chain, nb);
15490 +}
15491 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
15492 +
15493 +int unregister_die_notifier(struct notifier_block *nb)
15494 +{
15495 +       return atomic_notifier_chain_unregister(&i386die_chain, nb);
15496 +}
15497 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
15498 +
15499 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
15500 +{
15501 +       return  p > (void *)tinfo &&
15502 +               p < (void *)tinfo + THREAD_SIZE - 3;
15503 +}
15504 +
15505 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
15506 +                               unsigned long *stack, unsigned long ebp,
15507 +                               struct stacktrace_ops *ops, void *data)
15508 +{
15509 +       unsigned long addr;
15510 +
15511 +#ifdef CONFIG_FRAME_POINTER
15512 +       while (valid_stack_ptr(tinfo, (void *)ebp)) {
15513 +               unsigned long new_ebp;
15514 +               addr = *(unsigned long *)(ebp + 4);
15515 +               ops->address(data, addr);
15516 +               /*
15517 +                * break out of recursive entries (such as
15518 +                * end_of_stack_stop_unwind_function). Also,
15519 +                * we can never allow a frame pointer to
15520 +                * move downwards!
15521 +                */
15522 +               new_ebp = *(unsigned long *)ebp;
15523 +               if (new_ebp <= ebp)
15524 +                       break;
15525 +               ebp = new_ebp;
15526 +       }
15527 +#else
15528 +       while (valid_stack_ptr(tinfo, stack)) {
15529 +               addr = *stack++;
15530 +               if (__kernel_text_address(addr))
15531 +                       ops->address(data, addr);
15532 +       }
15533 +#endif
15534 +       return ebp;
15535 +}
15536 +
15537 +struct ops_and_data {
15538 +       struct stacktrace_ops *ops;
15539 +       void *data;
15540 +};
15541 +
15542 +static asmlinkage int
15543 +dump_trace_unwind(struct unwind_frame_info *info, void *data)
15544 +{
15545 +       struct ops_and_data *oad = (struct ops_and_data *)data;
15546 +       int n = 0;
15547 +
15548 +       while (unwind(info) == 0 && UNW_PC(info)) {
15549 +               n++;
15550 +               oad->ops->address(oad->data, UNW_PC(info));
15551 +               if (arch_unw_user_mode(info))
15552 +                       break;
15553 +       }
15554 +       return n;
15555 +}
15556 +
15557 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
15558 +               unsigned long *stack,
15559 +               struct stacktrace_ops *ops, void *data)
15560 +{
15561 +       unsigned long ebp = 0;
15562 +
15563 +       if (!task)
15564 +               task = current;
15565 +
15566 +       if (call_trace >= 0) {
15567 +               int unw_ret = 0;
15568 +               struct unwind_frame_info info;
15569 +               struct ops_and_data oad = { .ops = ops, .data = data };
15570 +
15571 +               if (regs) {
15572 +                       if (unwind_init_frame_info(&info, task, regs) == 0)
15573 +                               unw_ret = dump_trace_unwind(&info, &oad);
15574 +               } else if (task == current)
15575 +                       unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
15576 +               else {
15577 +                       if (unwind_init_blocked(&info, task) == 0)
15578 +                               unw_ret = dump_trace_unwind(&info, &oad);
15579 +               }
15580 +               if (unw_ret > 0) {
15581 +                       if (call_trace == 1 && !arch_unw_user_mode(&info)) {
15582 +                               ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
15583 +                                            UNW_PC(&info));
15584 +                               if (UNW_SP(&info) >= PAGE_OFFSET) {
15585 +                                       ops->warning(data, "Leftover inexact backtrace:\n");
15586 +                                       stack = (void *)UNW_SP(&info);
15587 +                                       if (!stack)
15588 +                                               return;
15589 +                                       ebp = UNW_FP(&info);
15590 +                               } else
15591 +                                       ops->warning(data, "Full inexact backtrace again:\n");
15592 +                       } else if (call_trace >= 1)
15593 +                               return;
15594 +                       else
15595 +                               ops->warning(data, "Full inexact backtrace again:\n");
15596 +               } else
15597 +                       ops->warning(data, "Inexact backtrace:\n");
15598 +       }
15599 +       if (!stack) {
15600 +               unsigned long dummy;
15601 +               stack = &dummy;
15602 +               if (task && task != current)
15603 +                       stack = (unsigned long *)task->thread.esp;
15604 +       }
15605 +
15606 +#ifdef CONFIG_FRAME_POINTER
15607 +       if (!ebp) {
15608 +               if (task == current) {
15609 +                       /* Grab ebp right from our regs */
15610 +                       asm ("movl %%ebp, %0" : "=r" (ebp) : );
15611 +               } else {
15612 +                       /* ebp is the last reg pushed by switch_to */
15613 +                       ebp = *(unsigned long *) task->thread.esp;
15614 +               }
15615 +       }
15616 +#endif
15617 +
15618 +       while (1) {
15619 +               struct thread_info *context;
15620 +               context = (struct thread_info *)
15621 +                       ((unsigned long)stack & (~(THREAD_SIZE - 1)));
15622 +               ebp = print_context_stack(context, stack, ebp, ops, data);
15623 +               /* Should be after the line below, but somewhere
15624 +                  in early boot context comes out corrupted and we
15625 +                  can't reference it -AK */
15626 +               if (ops->stack(data, "IRQ") < 0)
15627 +                       break;
15628 +               stack = (unsigned long*)context->previous_esp;
15629 +               if (!stack)
15630 +                       break;
15631 +       }
15632 +}
15633 +EXPORT_SYMBOL(dump_trace);
15634 +
15635 +static void
15636 +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
15637 +{
15638 +       printk(data);
15639 +       print_symbol(msg, symbol);
15640 +       printk("\n");
15641 +}
15642 +
15643 +static void print_trace_warning(void *data, char *msg)
15644 +{
15645 +       printk("%s%s\n", (char *)data, msg);
15646 +}
15647 +
15648 +static int print_trace_stack(void *data, char *name)
15649 +{
15650 +       return 0;
15651 +}
15652 +
15653 +/*
15654 + * Print one address/symbol entry per line.
15655 + */
15656 +static void print_trace_address(void *data, unsigned long addr)
15657 +{
15658 +       printk("%s [<%08lx>] ", (char *)data, addr);
15659 +       print_symbol("%s\n", addr);
15660 +}
15661 +
15662 +static struct stacktrace_ops print_trace_ops = {
15663 +       .warning = print_trace_warning,
15664 +       .warning_symbol = print_trace_warning_symbol,
15665 +       .stack = print_trace_stack,
15666 +       .address = print_trace_address,
15667 +};
15668 +
15669 +static void
15670 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
15671 +                  unsigned long * stack, char *log_lvl)
15672 +{
15673 +       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
15674 +       printk("%s =======================\n", log_lvl);
15675 +}
15676 +
15677 +void show_trace(struct task_struct *task, struct pt_regs *regs,
15678 +               unsigned long * stack)
15679 +{
15680 +       show_trace_log_lvl(task, regs, stack, "");
15681 +}
15682 +
15683 +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
15684 +                              unsigned long *esp, char *log_lvl)
15685 +{
15686 +       unsigned long *stack;
15687 +       int i;
15688 +
15689 +       if (esp == NULL) {
15690 +               if (task)
15691 +                       esp = (unsigned long*)task->thread.esp;
15692 +               else
15693 +                       esp = (unsigned long *)&esp;
15694 +       }
15695 +
15696 +       stack = esp;
15697 +       for(i = 0; i < kstack_depth_to_print; i++) {
15698 +               if (kstack_end(stack))
15699 +                       break;
15700 +               if (i && ((i % 8) == 0))
15701 +                       printk("\n%s       ", log_lvl);
15702 +               printk("%08lx ", *stack++);
15703 +       }
15704 +       printk("\n%sCall Trace:\n", log_lvl);
15705 +       show_trace_log_lvl(task, regs, esp, log_lvl);
15706 +}
15707 +
15708 +void show_stack(struct task_struct *task, unsigned long *esp)
15709 +{
15710 +       printk("       ");
15711 +       show_stack_log_lvl(task, NULL, esp, "");
15712 +}
15713 +
15714 +/*
15715 + * The architecture-independent dump_stack generator
15716 + */
15717 +void dump_stack(void)
15718 +{
15719 +       unsigned long stack;
15720 +
15721 +       show_trace(current, NULL, &stack);
15722 +}
15723 +
15724 +EXPORT_SYMBOL(dump_stack);
15725 +
15726 +void show_registers(struct pt_regs *regs)
15727 +{
15728 +       int i;
15729 +       int in_kernel = 1;
15730 +       unsigned long esp;
15731 +       unsigned short ss;
15732 +
15733 +       esp = (unsigned long) (&regs->esp);
15734 +       savesegment(ss, ss);
15735 +       if (user_mode_vm(regs)) {
15736 +               in_kernel = 0;
15737 +               esp = regs->esp;
15738 +               ss = regs->xss & 0xffff;
15739 +       }
15740 +       print_modules();
15741 +       printk(KERN_EMERG "CPU:    %d\n"
15742 +               KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
15743 +               KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
15744 +               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
15745 +               print_tainted(), regs->eflags, init_utsname()->release,
15746 +               (int)strcspn(init_utsname()->version, " "),
15747 +               init_utsname()->version);
15748 +       print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
15749 +       printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
15750 +               regs->eax, regs->ebx, regs->ecx, regs->edx);
15751 +       printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
15752 +               regs->esi, regs->edi, regs->ebp, esp);
15753 +       printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
15754 +               regs->xds & 0xffff, regs->xes & 0xffff, ss);
15755 +       printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
15756 +               TASK_COMM_LEN, current->comm, current->pid,
15757 +               current_thread_info(), current, current->thread_info);
15758 +       /*
15759 +        * When in-kernel, we also print out the stack and code at the
15760 +        * time of the fault..
15761 +        */
15762 +       if (in_kernel) {
15763 +               u8 __user *eip;
15764 +               int code_bytes = 64;
15765 +               unsigned char c;
15766 +
15767 +               printk("\n" KERN_EMERG "Stack: ");
15768 +               show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
15769 +
15770 +               printk(KERN_EMERG "Code: ");
15771 +
15772 +               eip = (u8 __user *)regs->eip - 43;
15773 +               if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
15774 +                       /* try starting at EIP */
15775 +                       eip = (u8 __user *)regs->eip;
15776 +                       code_bytes = 32;
15777 +               }
15778 +               for (i = 0; i < code_bytes; i++, eip++) {
15779 +                       if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
15780 +                               printk(" Bad EIP value.");
15781 +                               break;
15782 +                       }
15783 +                       if (eip == (u8 __user *)regs->eip)
15784 +                               printk("<%02x> ", c);
15785 +                       else
15786 +                               printk("%02x ", c);
15787 +               }
15788 +       }
15789 +       printk("\n");
15790 +}      
15791 +
15792 +static void handle_BUG(struct pt_regs *regs)
15793 +{
15794 +       unsigned long eip = regs->eip;
15795 +       unsigned short ud2;
15796 +
15797 +       if (eip < PAGE_OFFSET)
15798 +               return;
15799 +       if (probe_kernel_address((unsigned short __user *)eip, ud2))
15800 +               return;
15801 +       if (ud2 != 0x0b0f)
15802 +               return;
15803 +
15804 +       printk(KERN_EMERG "------------[ cut here ]------------\n");
15805 +
15806 +#ifdef CONFIG_DEBUG_BUGVERBOSE
15807 +       do {
15808 +               unsigned short line;
15809 +               char *file;
15810 +               char c;
15811 +
15812 +               if (probe_kernel_address((unsigned short __user *)(eip + 2),
15813 +                                       line))
15814 +                       break;
15815 +               if (__get_user(file, (char * __user *)(eip + 4)) ||
15816 +                   (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
15817 +                       file = "<bad filename>";
15818 +
15819 +               printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
15820 +               return;
15821 +       } while (0);
15822 +#endif
15823 +       printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
15824 +}
15825 +
15826 +/* This is gone through when something in the kernel
15827 + * has done something bad and is about to be terminated.
15828 +*/
15829 +void die(const char * str, struct pt_regs * regs, long err)
15830 +{
15831 +       static struct {
15832 +               spinlock_t lock;
15833 +               u32 lock_owner;
15834 +               int lock_owner_depth;
15835 +       } die = {
15836 +               .lock =                 SPIN_LOCK_UNLOCKED,
15837 +               .lock_owner =           -1,
15838 +               .lock_owner_depth =     0
15839 +       };
15840 +       static int die_counter;
15841 +       unsigned long flags;
15842 +
15843 +       oops_enter();
15844 +
15845 +       if (die.lock_owner != raw_smp_processor_id()) {
15846 +               console_verbose();
15847 +               spin_lock_irqsave(&die.lock, flags);
15848 +               die.lock_owner = smp_processor_id();
15849 +               die.lock_owner_depth = 0;
15850 +               bust_spinlocks(1);
15851 +       }
15852 +       else
15853 +               local_save_flags(flags);
15854 +
15855 +       if (++die.lock_owner_depth < 3) {
15856 +               int nl = 0;
15857 +               unsigned long esp;
15858 +               unsigned short ss;
15859 +
15860 +               handle_BUG(regs);
15861 +               printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
15862 +#ifdef CONFIG_PREEMPT
15863 +               printk(KERN_EMERG "PREEMPT ");
15864 +               nl = 1;
15865 +#endif
15866 +#ifdef CONFIG_SMP
15867 +               if (!nl)
15868 +                       printk(KERN_EMERG);
15869 +               printk("SMP ");
15870 +               nl = 1;
15871 +#endif
15872 +#ifdef CONFIG_DEBUG_PAGEALLOC
15873 +               if (!nl)
15874 +                       printk(KERN_EMERG);
15875 +               printk("DEBUG_PAGEALLOC");
15876 +               nl = 1;
15877 +#endif
15878 +               if (nl)
15879 +                       printk("\n");
15880 +               if (notify_die(DIE_OOPS, str, regs, err,
15881 +                                       current->thread.trap_no, SIGSEGV) !=
15882 +                               NOTIFY_STOP) {
15883 +                       show_registers(regs);
15884 +                       /* Executive summary in case the oops scrolled away */
15885 +                       esp = (unsigned long) (&regs->esp);
15886 +                       savesegment(ss, ss);
15887 +                       if (user_mode(regs)) {
15888 +                               esp = regs->esp;
15889 +                               ss = regs->xss & 0xffff;
15890 +                       }
15891 +                       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
15892 +                       print_symbol("%s", regs->eip);
15893 +                       printk(" SS:ESP %04x:%08lx\n", ss, esp);
15894 +               }
15895 +               else
15896 +                       regs = NULL;
15897 +       } else
15898 +               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
15899 +
15900 +       bust_spinlocks(0);
15901 +       die.lock_owner = -1;
15902 +       spin_unlock_irqrestore(&die.lock, flags);
15903 +
15904 +       if (!regs)
15905 +               return;
15906 +
15907 +       if (kexec_should_crash(current))
15908 +               crash_kexec(regs);
15909 +
15910 +       if (in_interrupt())
15911 +               panic("Fatal exception in interrupt");
15912 +
15913 +       if (panic_on_oops)
15914 +               panic("Fatal exception");
15915 +
15916 +       oops_exit();
15917 +       do_exit(SIGSEGV);
15918 +}
15919 +
15920 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
15921 +{
15922 +       if (!user_mode_vm(regs))
15923 +               die(str, regs, err);
15924 +}
15925 +
15926 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
15927 +                             struct pt_regs * regs, long error_code,
15928 +                             siginfo_t *info)
15929 +{
15930 +       struct task_struct *tsk = current;
15931 +       tsk->thread.error_code = error_code;
15932 +       tsk->thread.trap_no = trapnr;
15933 +
15934 +       if (regs->eflags & VM_MASK) {
15935 +               if (vm86)
15936 +                       goto vm86_trap;
15937 +               goto trap_signal;
15938 +       }
15939 +
15940 +       if (!user_mode(regs))
15941 +               goto kernel_trap;
15942 +
15943 +       trap_signal: {
15944 +               if (info)
15945 +                       force_sig_info(signr, info, tsk);
15946 +               else
15947 +                       force_sig(signr, tsk);
15948 +               return;
15949 +       }
15950 +
15951 +       kernel_trap: {
15952 +               if (!fixup_exception(regs))
15953 +                       die(str, regs, error_code);
15954 +               return;
15955 +       }
15956 +
15957 +       vm86_trap: {
15958 +               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
15959 +               if (ret) goto trap_signal;
15960 +               return;
15961 +       }
15962 +}
15963 +
15964 +#define DO_ERROR(trapnr, signr, str, name) \
15965 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15966 +{ \
15967 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15968 +                                               == NOTIFY_STOP) \
15969 +               return; \
15970 +       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
15971 +}
15972 +
15973 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15974 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15975 +{ \
15976 +       siginfo_t info; \
15977 +       info.si_signo = signr; \
15978 +       info.si_errno = 0; \
15979 +       info.si_code = sicode; \
15980 +       info.si_addr = (void __user *)siaddr; \
15981 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15982 +                                               == NOTIFY_STOP) \
15983 +               return; \
15984 +       do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
15985 +}
15986 +
15987 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
15988 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15989 +{ \
15990 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15991 +                                               == NOTIFY_STOP) \
15992 +               return; \
15993 +       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
15994 +}
15995 +
15996 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15997 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15998 +{ \
15999 +       siginfo_t info; \
16000 +       info.si_signo = signr; \
16001 +       info.si_errno = 0; \
16002 +       info.si_code = sicode; \
16003 +       info.si_addr = (void __user *)siaddr; \
16004 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16005 +                                               == NOTIFY_STOP) \
16006 +               return; \
16007 +       do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16008 +}
16009 +
16010 +DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
16011 +#ifndef CONFIG_KPROBES
16012 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
16013 +#endif
16014 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
16015 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
16016 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
16017 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
16018 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16019 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
16020 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
16021 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
16022 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
16023 +
16024 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
16025 +                                             long error_code)
16026 +{
16027 +       current->thread.error_code = error_code;
16028 +       current->thread.trap_no = 13;
16029 +
16030 +       if (regs->eflags & VM_MASK)
16031 +               goto gp_in_vm86;
16032 +
16033 +       if (!user_mode(regs))
16034 +               goto gp_in_kernel;
16035 +
16036 +       current->thread.error_code = error_code;
16037 +       current->thread.trap_no = 13;
16038 +       force_sig(SIGSEGV, current);
16039 +       return;
16040 +
16041 +gp_in_vm86:
16042 +       local_irq_enable();
16043 +       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
16044 +       return;
16045 +
16046 +gp_in_kernel:
16047 +       if (!fixup_exception(regs)) {
16048 +               if (notify_die(DIE_GPF, "general protection fault", regs,
16049 +                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
16050 +                       return;
16051 +               die("general protection fault", regs, error_code);
16052 +       }
16053 +}
16054 +
16055 +static __kprobes void
16056 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
16057 +{
16058 +       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
16059 +               "CPU %d.\n", reason, smp_processor_id());
16060 +       printk(KERN_EMERG "You probably have a hardware problem with your RAM "
16061 +                       "chips\n");
16062 +       if (panic_on_unrecovered_nmi)
16063 +                panic("NMI: Not continuing");
16064 +
16065 +       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
16066 +
16067 +       /* Clear and disable the memory parity error line. */
16068 +       clear_mem_error(reason);
16069 +}
16070 +
16071 +static __kprobes void
16072 +io_check_error(unsigned char reason, struct pt_regs * regs)
16073 +{
16074 +       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
16075 +       show_registers(regs);
16076 +
16077 +       /* Re-enable the IOCK line, wait for a few seconds */
16078 +       clear_io_check_error(reason);
16079 +}
16080 +
16081 +static __kprobes void
16082 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
16083 +{
16084 +#ifdef CONFIG_MCA
16085 +       /* Might actually be able to figure out what the guilty party
16086 +       * is. */
16087 +       if( MCA_bus ) {
16088 +               mca_handle_nmi();
16089 +               return;
16090 +       }
16091 +#endif
16092 +       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
16093 +               "CPU %d.\n", reason, smp_processor_id());
16094 +       printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
16095 +       if (panic_on_unrecovered_nmi)
16096 +                panic("NMI: Not continuing");
16097 +
16098 +       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
16099 +}
16100 +
16101 +static DEFINE_SPINLOCK(nmi_print_lock);
16102 +
16103 +void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16104 +{
16105 +       if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
16106 +           NOTIFY_STOP)
16107 +               return;
16108 +
16109 +       spin_lock(&nmi_print_lock);
16110 +       /*
16111 +       * We are in trouble anyway, let's at least try
16112 +       * to get a message out.
16113 +       */
16114 +       bust_spinlocks(1);
16115 +       printk(KERN_EMERG "%s", msg);
16116 +       printk(" on CPU%d, eip %08lx, registers:\n",
16117 +               smp_processor_id(), regs->eip);
16118 +       show_registers(regs);
16119 +       printk(KERN_EMERG "console shuts up ...\n");
16120 +       console_silent();
16121 +       spin_unlock(&nmi_print_lock);
16122 +       bust_spinlocks(0);
16123 +
16124 +       /* If we are in kernel we are probably nested up pretty bad
16125 +        * and might as well get out now while we still can.
16126 +       */
16127 +       if (!user_mode_vm(regs)) {
16128 +               current->thread.trap_no = 2;
16129 +               crash_kexec(regs);
16130 +       }
16131 +
16132 +       do_exit(SIGSEGV);
16133 +}
16134 +
16135 +static __kprobes void default_do_nmi(struct pt_regs * regs)
16136 +{
16137 +       unsigned char reason = 0;
16138 +
16139 +       /* Only the BSP gets external NMIs from the system.  */
16140 +       if (!smp_processor_id())
16141 +               reason = get_nmi_reason();
16142
16143 +       if (!(reason & 0xc0)) {
16144 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16145 +                                                       == NOTIFY_STOP)
16146 +                       return;
16147 +#ifdef CONFIG_X86_LOCAL_APIC
16148 +               /*
16149 +                * Ok, so this is none of the documented NMI sources,
16150 +                * so it must be the NMI watchdog.
16151 +                */
16152 +               if (nmi_watchdog_tick(regs, reason))
16153 +                       return;
16154 +               if (!do_nmi_callback(regs, smp_processor_id()))
16155 +#endif
16156 +                       unknown_nmi_error(reason, regs);
16157 +
16158 +               return;
16159 +       }
16160 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16161 +               return;
16162 +       if (reason & 0x80)
16163 +               mem_parity_error(reason, regs);
16164 +       if (reason & 0x40)
16165 +               io_check_error(reason, regs);
16166 +       /*
16167 +        * Reassert NMI in case it became active meanwhile
16168 +        * as it's edge-triggered.
16169 +        */
16170 +       reassert_nmi();
16171 +}
16172 +
16173 +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
16174 +{
16175 +       int cpu;
16176 +
16177 +       nmi_enter();
16178 +
16179 +       cpu = smp_processor_id();
16180 +
16181 +       ++nmi_count(cpu);
16182 +
16183 +       default_do_nmi(regs);
16184 +
16185 +       nmi_exit();
16186 +}
16187 +
16188 +#ifdef CONFIG_KPROBES
16189 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
16190 +{
16191 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
16192 +                       == NOTIFY_STOP)
16193 +               return;
16194 +       /* This is an interrupt gate, because kprobes wants interrupts
16195 +       disabled.  Normal trap handlers don't. */
16196 +       restore_interrupts(regs);
16197 +       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
16198 +}
16199 +#endif
16200 +
16201 +/*
16202 + * Our handling of the processor debug registers is non-trivial.
16203 + * We do not clear them on entry and exit from the kernel. Therefore
16204 + * it is possible to get a watchpoint trap here from inside the kernel.
16205 + * However, the code in ./ptrace.c has ensured that the user can
16206 + * only set watchpoints on userspace addresses. Therefore the in-kernel
16207 + * watchpoint trap can only occur in code which is reading/writing
16208 + * from user space. Such code must not hold kernel locks (since it
16209 + * can equally take a page fault), therefore it is safe to call
16210 + * force_sig_info even though that claims and releases locks.
16211 + * 
16212 + * Code in ./signal.c ensures that the debug control register
16213 + * is restored before we deliver any signal, and therefore that
16214 + * user code runs with the correct debug control register even though
16215 + * we clear it here.
16216 + *
16217 + * Being careful here means that we don't have to be as careful in a
16218 + * lot of more complicated places (task switching can be a bit lazy
16219 + * about restoring all the debug state, and ptrace doesn't have to
16220 + * find every occurrence of the TF bit that could be saved away even
16221 + * by user code)
16222 + */
16223 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
16224 +{
16225 +       unsigned int condition;
16226 +       struct task_struct *tsk = current;
16227 +
16228 +       get_debugreg(condition, 6);
16229 +
16230 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16231 +                                       SIGTRAP) == NOTIFY_STOP)
16232 +               return;
16233 +       /* It's safe to allow irq's after DR6 has been saved */
16234 +       if (regs->eflags & X86_EFLAGS_IF)
16235 +               local_irq_enable();
16236 +
16237 +       /* Mask out spurious debug traps due to lazy DR7 setting */
16238 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
16239 +               if (!tsk->thread.debugreg[7])
16240 +                       goto clear_dr7;
16241 +       }
16242 +
16243 +       if (regs->eflags & VM_MASK)
16244 +               goto debug_vm86;
16245 +
16246 +       /* Save debug status register where ptrace can see it */
16247 +       tsk->thread.debugreg[6] = condition;
16248 +
16249 +       /*
16250 +        * Single-stepping through TF: make sure we ignore any events in
16251 +        * kernel space (but re-enable TF when returning to user mode).
16252 +        */
16253 +       if (condition & DR_STEP) {
16254 +               /*
16255 +                * We already checked v86 mode above, so we can
16256 +                * check for kernel mode by just checking the CPL
16257 +                * of CS.
16258 +                */
16259 +               if (!user_mode(regs))
16260 +                       goto clear_TF_reenable;
16261 +       }
16262 +
16263 +       /* Ok, finally something we can handle */
16264 +       send_sigtrap(tsk, regs, error_code);
16265 +
16266 +       /* Disable additional traps. They'll be re-enabled when
16267 +        * the signal is delivered.
16268 +        */
16269 +clear_dr7:
16270 +       set_debugreg(0, 7);
16271 +       return;
16272 +
16273 +debug_vm86:
16274 +       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
16275 +       return;
16276 +
16277 +clear_TF_reenable:
16278 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
16279 +       regs->eflags &= ~TF_MASK;
16280 +       return;
16281 +}
16282 +
16283 +/*
16284 + * Note that we play around with the 'TS' bit in an attempt to get
16285 + * the correct behaviour even in the presence of the asynchronous
16286 + * IRQ13 behaviour
16287 + */
16288 +void math_error(void __user *eip)
16289 +{
16290 +       struct task_struct * task;
16291 +       siginfo_t info;
16292 +       unsigned short cwd, swd;
16293 +
16294 +       /*
16295 +        * Save the info for the exception handler and clear the error.
16296 +        */
16297 +       task = current;
16298 +       save_init_fpu(task);
16299 +       task->thread.trap_no = 16;
16300 +       task->thread.error_code = 0;
16301 +       info.si_signo = SIGFPE;
16302 +       info.si_errno = 0;
16303 +       info.si_code = __SI_FAULT;
16304 +       info.si_addr = eip;
16305 +       /*
16306 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
16307 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
16308 +        * C1 reg you need in case of a stack fault, 0x040 is the stack
16309 +        * fault bit.  We should only be taking one exception at a time,
16310 +        * so if this combination doesn't produce any single exception,
16311 +        * then we have a bad program that isn't synchronizing its FPU usage
16312 +        * and it will suffer the consequences since we won't be able to
16313 +        * fully reproduce the context of the exception
16314 +        */
16315 +       cwd = get_fpu_cwd(task);
16316 +       swd = get_fpu_swd(task);
16317 +       switch (swd & ~cwd & 0x3f) {
16318 +               case 0x000: /* No unmasked exception */
16319 +                       return;
16320 +               default:    /* Multiple exceptions */
16321 +                       break;
16322 +               case 0x001: /* Invalid Op */
16323 +                       /*
16324 +                        * swd & 0x240 == 0x040: Stack Underflow
16325 +                        * swd & 0x240 == 0x240: Stack Overflow
16326 +                        * User must clear the SF bit (0x40) if set
16327 +                        */
16328 +                       info.si_code = FPE_FLTINV;
16329 +                       break;
16330 +               case 0x002: /* Denormalize */
16331 +               case 0x010: /* Underflow */
16332 +                       info.si_code = FPE_FLTUND;
16333 +                       break;
16334 +               case 0x004: /* Zero Divide */
16335 +                       info.si_code = FPE_FLTDIV;
16336 +                       break;
16337 +               case 0x008: /* Overflow */
16338 +                       info.si_code = FPE_FLTOVF;
16339 +                       break;
16340 +               case 0x020: /* Precision */
16341 +                       info.si_code = FPE_FLTRES;
16342 +                       break;
16343 +       }
16344 +       force_sig_info(SIGFPE, &info, task);
16345 +}
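/*
 * Editor's sketch (not part of the original patch): the (~cwd & swd)
 * masking used in math_error() above, pulled out into a stand-alone
 * helper so the decoding is easier to follow.  The helper name is
 * hypothetical; the bit values simply mirror the switch statement above.
 */
static const char *decode_x87_exception(unsigned short cwd, unsigned short swd)
{
	/* cwd bits 0-5 are the exception masks, swd bits 0-5 the sticky
	 * exception flags; keeping only bits that are raised *and* unmasked
	 * identifies the exception that actually trapped. */
	switch (swd & ~cwd & 0x3f) {
	case 0x000: return "no unmasked exception";
	case 0x001: return "invalid operation (check SF bit 0x40 in swd)";
	case 0x002:
	case 0x010: return "denormal/underflow";
	case 0x004: return "divide by zero";
	case 0x008: return "overflow";
	case 0x020: return "precision";
	default:    return "multiple exceptions";
	}
}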
16346 +
16347 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
16348 +{
16349 +       ignore_fpu_irq = 1;
16350 +       math_error((void __user *)regs->eip);
16351 +}
16352 +
16353 +static void simd_math_error(void __user *eip)
16354 +{
16355 +       struct task_struct * task;
16356 +       siginfo_t info;
16357 +       unsigned short mxcsr;
16358 +
16359 +       /*
16360 +        * Save the info for the exception handler and clear the error.
16361 +        */
16362 +       task = current;
16363 +       save_init_fpu(task);
16364 +       task->thread.trap_no = 19;
16365 +       task->thread.error_code = 0;
16366 +       info.si_signo = SIGFPE;
16367 +       info.si_errno = 0;
16368 +       info.si_code = __SI_FAULT;
16369 +       info.si_addr = eip;
16370 +       /*
16371 +        * The SIMD FPU exceptions are handled a little differently, as there
16372 +        * is only a single status/control register.  Thus, to determine which
16373 +        * unmasked exception was caught we must mask the exception mask bits
16374 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
16375 +        */
16376 +       mxcsr = get_fpu_mxcsr(task);
16377 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
16378 +               case 0x000:
16379 +               default:
16380 +                       break;
16381 +               case 0x001: /* Invalid Op */
16382 +                       info.si_code = FPE_FLTINV;
16383 +                       break;
16384 +               case 0x002: /* Denormalize */
16385 +               case 0x010: /* Underflow */
16386 +                       info.si_code = FPE_FLTUND;
16387 +                       break;
16388 +               case 0x004: /* Zero Divide */
16389 +                       info.si_code = FPE_FLTDIV;
16390 +                       break;
16391 +               case 0x008: /* Overflow */
16392 +                       info.si_code = FPE_FLTOVF;
16393 +                       break;
16394 +               case 0x020: /* Precision */
16395 +                       info.si_code = FPE_FLTRES;
16396 +                       break;
16397 +       }
16398 +       force_sig_info(SIGFPE, &info, task);
16399 +}
16400 +
16401 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
16402 +                                         long error_code)
16403 +{
16404 +       if (cpu_has_xmm) {
16405 +               /* Handle SIMD FPU exceptions on PIII+ processors. */
16406 +               ignore_fpu_irq = 1;
16407 +               simd_math_error((void __user *)regs->eip);
16408 +       } else {
16409 +               /*
16410 +                * Handle strange cache flush from user space exception
16411 +                * in all other cases.  This is undocumented behaviour.
16412 +                */
16413 +               if (regs->eflags & VM_MASK) {
16414 +                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
16415 +                                         error_code);
16416 +                       return;
16417 +               }
16418 +               current->thread.trap_no = 19;
16419 +               current->thread.error_code = error_code;
16420 +               die_if_kernel("cache flush denied", regs, error_code);
16421 +               force_sig(SIGSEGV, current);
16422 +       }
16423 +}
16424 +
16425 +#ifndef CONFIG_XEN
16426 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
16427 +                                         long error_code)
16428 +{
16429 +#if 0
16430 +       /* No need to warn about this any longer. */
16431 +       printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
16432 +#endif
16433 +}
16434 +
16435 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
16436 +{
16437 +       unsigned long *switch16_ptr, *switch32_ptr;
16438 +       struct pt_regs *regs;
16439 +       unsigned long stack_top, stack_bot;
16440 +       unsigned short iret_frame16_off;
16441 +       int cpu = smp_processor_id();
16442 +       /* reserve the space on 32bit stack for the magic switch16 pointer */
16443 +       memmove(stk, stk + 8, sizeof(struct pt_regs));
16444 +       switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
16445 +       regs = (struct pt_regs *)stk;
16446 +       /* now the switch32 on 16bit stack */
16447 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
16448 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
16449 +       switch32_ptr = (unsigned long *)(stack_top - 8);
16450 +       iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
16451 +       /* copy iret frame on 16bit stack */
16452 +       memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
16453 +       /* fill in the switch pointers */
16454 +       switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
16455 +       switch16_ptr[1] = __ESPFIX_SS;
16456 +       switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
16457 +               8 - CPU_16BIT_STACK_SIZE;
16458 +       switch32_ptr[1] = __KERNEL_DS;
16459 +}
16460 +
16461 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
16462 +{
16463 +       unsigned long *switch32_ptr;
16464 +       unsigned char *stack16, *stack32;
16465 +       unsigned long stack_top, stack_bot;
16466 +       int len;
16467 +       int cpu = smp_processor_id();
16468 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
16469 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
16470 +       switch32_ptr = (unsigned long *)(stack_top - 8);
16471 +       /* copy the data from 16bit stack to 32bit stack */
16472 +       len = CPU_16BIT_STACK_SIZE - 8 - sp;
16473 +       stack16 = (unsigned char *)(stack_bot + sp);
16474 +       stack32 = (unsigned char *)
16475 +               (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
16476 +       memcpy(stack32, stack16, len);
16477 +       return stack32;
16478 +}
16479 +#endif
16480 +
16481 +/*
16482 + *  'math_state_restore()' saves the current math information in the
16483 + * old math state array, and gets the new ones from the current task
16484 + *
16485 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
16486 + * Don't touch unless you *really* know how it works.
16487 + *
16488 + * Must be called with kernel preemption disabled (in this case,
16489 + * local interrupts are disabled at the call-site in entry.S).
16490 + */
16491 +asmlinkage void math_state_restore(struct pt_regs regs)
16492 +{
16493 +       struct thread_info *thread = current_thread_info();
16494 +       struct task_struct *tsk = thread->task;
16495 +
16496 +       /* NB. 'clts' is done for us by Xen during virtual trap. */
16497 +       if (!tsk_used_math(tsk))
16498 +               init_fpu(tsk);
16499 +       restore_fpu(tsk);
16500 +       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
16501 +}
16502 +
16503 +#ifndef CONFIG_MATH_EMULATION
16504 +
16505 +asmlinkage void math_emulate(long arg)
16506 +{
16507 +       printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
16508 +       printk(KERN_EMERG "killing %s.\n",current->comm);
16509 +       force_sig(SIGFPE,current);
16510 +       schedule();
16511 +}
16512 +
16513 +#endif /* CONFIG_MATH_EMULATION */
16514 +
16515 +#ifdef CONFIG_X86_F00F_BUG
16516 +void __init trap_init_f00f_bug(void)
16517 +{
16518 +       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
16519 +
16520 +       /*
16521 +        * Update the IDT descriptor and reload the IDT so that
16522 +        * it uses the read-only mapped virtual address.
16523 +        */
16524 +       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
16525 +       load_idt(&idt_descr);
16526 +}
16527 +#endif
16528 +
16529 +
16530 +/*
16531 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
16532 + * for those that specify <dpl>|4 in the second field.
16533 + */
16534 +static trap_info_t trap_table[] = {
16535 +       {  0, 0, __KERNEL_CS, (unsigned long)divide_error               },
16536 +       {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
16537 +       {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
16538 +       {  4, 3, __KERNEL_CS, (unsigned long)overflow                   },
16539 +       {  5, 0, __KERNEL_CS, (unsigned long)bounds                     },
16540 +       {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                 },
16541 +       {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available     },
16542 +       {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
16543 +       { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS                },
16544 +       { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present        },
16545 +       { 12, 0, __KERNEL_CS, (unsigned long)stack_segment              },
16546 +       { 13, 0, __KERNEL_CS, (unsigned long)general_protection         },
16547 +       { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault               },
16548 +       { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment          },
16549 +       { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error          },
16550 +       { 17, 0, __KERNEL_CS, (unsigned long)alignment_check            },
16551 +#ifdef CONFIG_X86_MCE
16552 +       { 18, 0, __KERNEL_CS, (unsigned long)machine_check              },
16553 +#endif
16554 +       { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
16555 +       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
16556 +       {  0, 0,           0, 0                                         }
16557 +};
16558 +
16559 +void __init trap_init(void)
16560 +{
16561 +       HYPERVISOR_set_trap_table(trap_table);
16562 +
16563 +       if (cpu_has_fxsr) {
16564 +               /*
16565 +                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
16566 +                * Generates a compile-time "error: zero width for bit-field" if
16567 +                * the alignment is wrong.
16568 +                */
16569 +               struct fxsrAlignAssert {
16570 +                       int _:!(offsetof(struct task_struct,
16571 +                                       thread.i387.fxsave) & 15);
16572 +               };
16573 +
16574 +               printk(KERN_INFO "Enabling fast FPU save and restore... ");
16575 +               set_in_cr4(X86_CR4_OSFXSR);
16576 +               printk("done.\n");
16577 +       }
16578 +       if (cpu_has_xmm) {
16579 +               printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
16580 +                               "support... ");
16581 +               set_in_cr4(X86_CR4_OSXMMEXCPT);
16582 +               printk("done.\n");
16583 +       }
16584 +
16585 +       /*
16586 +        * Should be a barrier for any external CPU state.
16587 +        */
16588 +       cpu_init();
16589 +}
16590 +
16591 +void smp_trap_init(trap_info_t *trap_ctxt)
16592 +{
16593 +       trap_info_t *t = trap_table;
16594 +
16595 +       for (t = trap_table; t->address; t++) {
16596 +               trap_ctxt[t->vector].flags = t->flags;
16597 +               trap_ctxt[t->vector].cs = t->cs;
16598 +               trap_ctxt[t->vector].address = t->address;
16599 +       }
16600 +}
16601 +
16602 +static int __init kstack_setup(char *s)
16603 +{
16604 +       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
16605 +       return 1;
16606 +}
16607 +__setup("kstack=", kstack_setup);
16608 +
16609 +#ifdef CONFIG_STACK_UNWIND
16610 +static int __init call_trace_setup(char *s)
16611 +{
16612 +       if (strcmp(s, "old") == 0)
16613 +               call_trace = -1;
16614 +       else if (strcmp(s, "both") == 0)
16615 +               call_trace = 0;
16616 +       else if (strcmp(s, "newfallback") == 0)
16617 +               call_trace = 1;
16618 +       else if (strcmp(s, "new") == 0)
16619 +               call_trace = 2;
16620 +       return 1;
16621 +}
16622 +__setup("call_trace=", call_trace_setup);
16623 +#endif
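As an aside on the trap_table[] layout above: the second field packs the allowed
privilege level together with an "interrupt gate" marker, per the comment before
the table. A minimal sketch of that encoding (macro names are hypothetical, the
values follow the comment):

#include <assert.h>

/* bits 0-1: lowest privilege ring allowed to invoke the gate (DPL);
 * bit 2   : the "|4" marker -- Xen masks event delivery on entry,
 *           i.e. the gate behaves like an interrupt gate, not a trap gate. */
#define XEN_TRAP_DPL(flags)        ((flags) & 3)
#define XEN_TRAP_MASKS_EVENTS(f)   (((f) & 4) != 0)

int main(void)
{
	assert(XEN_TRAP_DPL(3 | 4) == 3);      /* int3: usable from ring 3 */
	assert(XEN_TRAP_MASKS_EVENTS(0 | 4));  /* debug/page-fault gates   */
	assert(!XEN_TRAP_MASKS_EVENTS(0));     /* ordinary trap gates      */
	return 0;
}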
16624 diff -ruNp linux-2.6.19/arch/i386/kernel/traps.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/traps.c
16625 --- linux-2.6.19/arch/i386/kernel/traps.c       2006-11-29 21:57:37.000000000 +0000
16626 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/traps.c     2007-02-02 19:10:21.000000000 +0000
16627 @@ -721,18 +721,11 @@ mem_parity_error(unsigned char reason, s
16628  static __kprobes void
16629  io_check_error(unsigned char reason, struct pt_regs * regs)
16630  {
16631 -       unsigned long i;
16632 -
16633         printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
16634         show_registers(regs);
16635  
16636         /* Re-enable the IOCK line, wait for a few seconds */
16637 -       reason = (reason & 0xf) | 8;
16638 -       outb(reason, 0x61);
16639 -       i = 2000;
16640 -       while (--i) udelay(1000);
16641 -       reason &= ~8;
16642 -       outb(reason, 0x61);
16643 +       clear_io_check_error(reason);
16644  }
16645  
16646  static __kprobes void
16647 diff -ruNp linux-2.6.19/arch/i386/kernel/tsc.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/tsc.c
16648 --- linux-2.6.19/arch/i386/kernel/tsc.c 2006-11-29 21:57:37.000000000 +0000
16649 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/tsc.c       2007-02-02 19:10:21.000000000 +0000
16650 @@ -101,6 +101,7 @@ static inline unsigned long long cycles_
16651         return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
16652  }
16653  
16654 +#ifndef CONFIG_XEN
16655  /*
16656   * Scheduler clock - returns current time in nanosec units.
16657   */
16658 @@ -124,6 +125,7 @@ unsigned long long sched_clock(void)
16659         /* return the value in ns */
16660         return cycles_2_ns(this_offset);
16661  }
16662 +#endif
16663  
16664  static unsigned long calculate_cpu_khz(void)
16665  {
16666 diff -ruNp linux-2.6.19/arch/i386/kernel/vm86.c linux-2.6.19-xen-3.0.4/arch/i386/kernel/vm86.c
16667 --- linux-2.6.19/arch/i386/kernel/vm86.c        2006-11-29 21:57:37.000000000 +0000
16668 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/vm86.c      2007-02-02 19:10:21.000000000 +0000
16669 @@ -97,7 +97,9 @@
16670  struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
16671  struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
16672  {
16673 +#ifndef CONFIG_X86_NO_TSS
16674         struct tss_struct *tss;
16675 +#endif
16676         struct pt_regs *ret;
16677         unsigned long tmp;
16678  
16679 @@ -122,12 +124,16 @@ struct pt_regs * fastcall save_v86_state
16680                 do_exit(SIGSEGV);
16681         }
16682  
16683 +#ifndef CONFIG_X86_NO_TSS
16684         tss = &per_cpu(init_tss, get_cpu());
16685 +#endif
16686         current->thread.esp0 = current->thread.saved_esp0;
16687         current->thread.sysenter_cs = __KERNEL_CS;
16688         load_esp0(tss, &current->thread);
16689         current->thread.saved_esp0 = 0;
16690 +#ifndef CONFIG_X86_NO_TSS
16691         put_cpu();
16692 +#endif
16693  
16694         loadsegment(fs, current->thread.saved_fs);
16695         loadsegment(gs, current->thread.saved_gs);
16696 @@ -251,7 +257,9 @@ out:
16697  
16698  static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
16699  {
16700 +#ifndef CONFIG_X86_NO_TSS
16701         struct tss_struct *tss;
16702 +#endif
16703         long eax;
16704  /*
16705   * make sure the vm86() system call doesn't try to do anything silly
16706 @@ -296,12 +304,16 @@ static void do_sys_vm86(struct kernel_vm
16707         savesegment(fs, tsk->thread.saved_fs);
16708         savesegment(gs, tsk->thread.saved_gs);
16709  
16710 +#ifndef CONFIG_X86_NO_TSS
16711         tss = &per_cpu(init_tss, get_cpu());
16712 +#endif
16713         tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
16714         if (cpu_has_sep)
16715                 tsk->thread.sysenter_cs = 0;
16716         load_esp0(tss, &tsk->thread);
16717 +#ifndef CONFIG_X86_NO_TSS
16718         put_cpu();
16719 +#endif
16720  
16721         tsk->thread.screen_bitmap = info->screen_bitmap;
16722         if (info->flags & VM86_SCREEN_BITMAP)
16723 diff -ruNp linux-2.6.19/arch/i386/kernel/vsyscall-note-xen.S linux-2.6.19-xen-3.0.4/arch/i386/kernel/vsyscall-note-xen.S
16724 --- linux-2.6.19/arch/i386/kernel/vsyscall-note-xen.S   1970-01-01 00:00:00.000000000 +0000
16725 +++ linux-2.6.19-xen-3.0.4/arch/i386/kernel/vsyscall-note-xen.S 2007-02-02 19:10:21.000000000 +0000
16726 @@ -0,0 +1,32 @@
16727 +/*
16728 + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
16729 + * Here we can supply some information useful to userland.
16730 + * First we get the vanilla i386 note that supplies the kernel version info.
16731 + */
16732 +
16733 +#include "vsyscall-note.S"
16734 +
16735 +/*
16736 + * Now we add a special note telling glibc's dynamic linker a fake hardware
16737 + * flavor that it will use to choose the search path for libraries in the
16738 + * same way it uses real hardware capabilities like "mmx".
16739 + * We supply "nosegneg" as the fake capability, to indicate that we
16740 + * do not like negative offsets in instructions using segment overrides,
16741 + * since we implement those inefficiently.  This makes it possible to
16742 + * install libraries optimized to avoid those access patterns in someplace
16743 + * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
16744 + * corresponding to the bits here is needed to make ldconfig work right.
16745 + * It should contain:
16746 + *     hwcap 0 nosegneg
16747 + * to match the mapping of bit to name that we give here.
16748 + */
16749 +#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
16750 +       ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
16751 +       .long ncaps, mask
16752 +#define NOTE_KERNELCAP(bit, name) \
16753 +       .byte bit; .asciz name
16754 +#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
16755 +
16756 +NOTE_KERNELCAP_BEGIN(1, 1)
16757 +NOTE_KERNELCAP(1, "nosegneg")  /* Change 1 back to 0 when glibc is fixed! */
16758 +NOTE_KERNELCAP_END
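For readers unfamiliar with ELF notes, the section emitted above lays out roughly
as follows. This is an illustrative C view only (it assumes the usual 4-byte note
alignment and uses hypothetical field names); it is not code from the patch:

#include <stdint.h>

struct kernelcap_note {
	uint32_t namesz;       /* sizeof("GNU") == 4                         */
	uint32_t descsz;       /* size of the descriptor that follows        */
	uint32_t type;         /* 2, as passed to ASM_ELF_NOTE_BEGIN above   */
	char     name[4];      /* "GNU"                                      */
	uint32_t ncaps;        /* 1                                          */
	uint32_t mask;         /* 1                                          */
	uint8_t  bit;          /* 1 ("change back to 0 when glibc is fixed") */
	char     capname[9];   /* "nosegneg"                                 */
};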
16759 diff -ruNp linux-2.6.19/arch/i386/lib/delay.c linux-2.6.19-xen-3.0.4/arch/i386/lib/delay.c
16760 --- linux-2.6.19/arch/i386/lib/delay.c  2006-11-29 21:57:37.000000000 +0000
16761 +++ linux-2.6.19-xen-3.0.4/arch/i386/lib/delay.c        2007-02-02 19:10:21.000000000 +0000
16762 @@ -60,6 +60,7 @@ void use_tsc_delay(void)
16763         delay_fn = delay_tsc;
16764  }
16765  
16766 +#ifndef CONFIG_X86_XEN
16767  int read_current_timer(unsigned long *timer_val)
16768  {
16769         if (delay_fn == delay_tsc) {
16770 @@ -68,7 +69,7 @@ int read_current_timer(unsigned long *ti
16771         }
16772         return -1;
16773  }
16774 -
16775 +#endif
16776  void __delay(unsigned long loops)
16777  {
16778         delay_fn(loops);
16779 diff -ruNp linux-2.6.19/arch/i386/mach-xen/Makefile linux-2.6.19-xen-3.0.4/arch/i386/mach-xen/Makefile
16780 --- linux-2.6.19/arch/i386/mach-xen/Makefile    1970-01-01 00:00:00.000000000 +0000
16781 +++ linux-2.6.19-xen-3.0.4/arch/i386/mach-xen/Makefile  2007-02-02 19:10:21.000000000 +0000
16782 @@ -0,0 +1,5 @@
16783 +#
16784 +# Makefile for the linux kernel.
16785 +#
16786 +
16787 +obj-y                          := setup.o irqflags.o
16788 diff -ruNp linux-2.6.19/arch/i386/mach-xen/irqflags.c linux-2.6.19-xen-3.0.4/arch/i386/mach-xen/irqflags.c
16789 --- linux-2.6.19/arch/i386/mach-xen/irqflags.c  1970-01-01 00:00:00.000000000 +0000
16790 +++ linux-2.6.19-xen-3.0.4/arch/i386/mach-xen/irqflags.c        2007-02-02 19:10:21.000000000 +0000
16791 @@ -0,0 +1,99 @@
16792 +#include <linux/module.h>
16793 +#include <linux/smp.h>
16794 +#include <asm/irqflags.h>
16795 +#include <asm/hypervisor.h>
16796 +
16797 +/* interrupt control.. */
16798 +
16799 +/* 
16800 + * The use of 'barrier' in the following reflects their use as local-lock
16801 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
16802 + * critical operations are executed. All critical operations must complete
16803 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
16804 + * includes these barriers, for example.
16805 + */
16806 +
16807 +unsigned long __raw_local_save_flags(void)
16808 +{
16809 +       struct vcpu_info *_vcpu;
16810 +       unsigned long flags;
16811 +
16812 +       preempt_disable();
16813 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
16814 +       flags = _vcpu->evtchn_upcall_mask;
16815 +       preempt_enable();
16816 +
16817 +       return flags;
16818 +}
16819 +EXPORT_SYMBOL(__raw_local_save_flags);
16820 +
16821 +void raw_local_irq_restore(unsigned long flags)
16822 +{
16823 +       struct vcpu_info *_vcpu;
16824 +
16825 +       preempt_disable();
16826 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
16827 +       if ((_vcpu->evtchn_upcall_mask = flags) == 0) {
16828 +               barrier(); /* unmask then check (avoid races) */
16829 +               if (unlikely(_vcpu->evtchn_upcall_pending))
16830 +                       force_evtchn_callback();
16831 +               preempt_enable();
16832 +       } else
16833 +               preempt_enable_no_resched();
16834 +
16835 +}
16836 +EXPORT_SYMBOL(raw_local_irq_restore);
16837 +
16838 +void raw_local_irq_disable(void)
16839 +{
16840 +       struct vcpu_info *_vcpu;
16841 +
16842 +       preempt_disable();
16843 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
16844 +       _vcpu->evtchn_upcall_mask = 1;
16845 +       preempt_enable_no_resched();
16846 +}
16847 +EXPORT_SYMBOL(raw_local_irq_disable);
16848 +
16849 +void raw_local_irq_enable(void)
16850 +{
16851 +       struct vcpu_info *_vcpu;
16852 +
16853 +       preempt_disable();
16854 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
16855 +       _vcpu->evtchn_upcall_mask = 0;
16856 +       barrier(); /* unmask then check (avoid races) */
16857 +       if (unlikely(_vcpu->evtchn_upcall_pending))
16858 +               force_evtchn_callback();
16859 +       preempt_enable();
16860 +}
16861 +EXPORT_SYMBOL(raw_local_irq_enable);
16862 +
16863 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
16864 +int raw_irqs_disabled(void)
16865 +{
16866 +       struct vcpu_info *_vcpu;
16867 +       int disabled;
16868 +
16869 +       preempt_disable();
16870 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
16871 +       disabled = (_vcpu->evtchn_upcall_mask != 0);
16872 +       preempt_enable_no_resched();
16873 +       return disabled;
16874 +}
16875 +EXPORT_SYMBOL(raw_irqs_disabled);
16876 +
16877 +unsigned long __raw_local_irq_save(void)
16878 +{
16879 +       struct vcpu_info *_vcpu;
16880 +       unsigned long flags;
16881 +
16882 +       preempt_disable();
16883 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
16884 +       flags = _vcpu->evtchn_upcall_mask;
16885 +       _vcpu->evtchn_upcall_mask = 1;
16886 +       preempt_enable_no_resched();
16887 +
16888 +       return flags;
16889 +}
16890 +EXPORT_SYMBOL(__raw_local_irq_save);
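The primitives above replace cli/sti with reads and writes of the shared
evtchn_upcall_mask. A typical save/restore pairing looks like the following
sketch; it uses only the functions defined in this file, and the calling
function itself is hypothetical:

void example_critical_section(void)
{
	unsigned long flags;

	flags = __raw_local_irq_save();   /* mask event-channel upcalls */
	/* ... touch state that an event upcall could also touch ... */
	raw_local_irq_restore(flags);     /* if flags re-enables delivery, any
	                                   * event that became pending in the
	                                   * meantime is flushed through
	                                   * force_evtchn_callback()    */
}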
16891 diff -ruNp linux-2.6.19/arch/i386/mach-xen/setup.c linux-2.6.19-xen-3.0.4/arch/i386/mach-xen/setup.c
16892 --- linux-2.6.19/arch/i386/mach-xen/setup.c     1970-01-01 00:00:00.000000000 +0000
16893 +++ linux-2.6.19-xen-3.0.4/arch/i386/mach-xen/setup.c   2007-02-02 19:10:21.000000000 +0000
16894 @@ -0,0 +1,169 @@
16895 +/*
16896 + *     Machine specific setup for generic
16897 + */
16898 +
16899 +#include <linux/smp.h>
16900 +#include <linux/init.h>
16901 +#include <linux/interrupt.h>
16902 +#include <linux/module.h>
16903 +#include <asm/acpi.h>
16904 +#include <asm/arch_hooks.h>
16905 +#include <asm/e820.h>
16906 +#include <asm/setup.h>
16907 +#include <asm/fixmap.h>
16908 +
16909 +#include <xen/interface/callback.h>
16910 +#include <xen/interface/memory.h>
16911 +
16912 +#ifdef CONFIG_HOTPLUG_CPU
16913 +#define DEFAULT_SEND_IPI       (1)
16914 +#else
16915 +#define DEFAULT_SEND_IPI       (0)
16916 +#endif
16917 +
16918 +int no_broadcast=DEFAULT_SEND_IPI;
16919 +
16920 +static __init int no_ipi_broadcast(char *str)
16921 +{
16922 +       get_option(&str, &no_broadcast);
16923 +       printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
16924 +                                                                                       "IPI Broadcast");
16925 +       return 1;
16926 +}
16927 +
16928 +__setup("no_ipi_broadcast", no_ipi_broadcast);
16929 +
16930 +static int __init print_ipi_mode(void)
16931 +{
16932 +       printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
16933 +                                                                                       "Shortcut");
16934 +       return 0;
16935 +}
16936 +
16937 +late_initcall(print_ipi_mode);
16938 +
16939 +/**
16940 + * machine_specific_memory_setup - Hook for machine specific memory setup.
16941 + *
16942 + * Description:
16943 + *     This is included late in kernel/setup.c so that it can make
16944 + *     use of all of the static functions.
16945 + **/
16946 +
16947 +char * __init machine_specific_memory_setup(void)
16948 +{
16949 +       int rc;
16950 +       struct xen_memory_map memmap;
16951 +       /*
16952 +        * This is rather large for a stack variable but this early in
16953 +        * the boot process we know we have plenty slack space.
16954 +        */
16955 +       struct e820entry map[E820MAX];
16956 +
16957 +       memmap.nr_entries = E820MAX;
16958 +       set_xen_guest_handle(memmap.buffer, map);
16959 +
16960 +       rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
16961 +       if ( rc == -ENOSYS ) {
16962 +               memmap.nr_entries = 1;
16963 +               map[0].addr = 0ULL;
16964 +               map[0].size = PFN_PHYS(xen_start_info->nr_pages);
16965 +               /* 8MB slack (to balance backend allocations). */
16966 +               map[0].size += 8ULL << 20;
16967 +               map[0].type = E820_RAM;
16968 +               rc = 0;
16969 +       }
16970 +       BUG_ON(rc);
16971 +
16972 +       sanitize_e820_map(map, (char *)&memmap.nr_entries);
16973 +
16974 +       BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
16975 +
16976 +       return "Xen";
16977 +}
16978 +
16979 +extern void hypervisor_callback(void);
16980 +extern void failsafe_callback(void);
16981 +extern void nmi(void);
16982 +
16983 +unsigned long *machine_to_phys_mapping;
16984 +EXPORT_SYMBOL(machine_to_phys_mapping);
16985 +unsigned int machine_to_phys_order;
16986 +EXPORT_SYMBOL(machine_to_phys_order);
16987 +
16988 +void __init machine_specific_arch_setup(void)
16989 +{
16990 +       int ret;
16991 +       struct xen_machphys_mapping mapping;
16992 +       unsigned long machine_to_phys_nr_ents;
16993 +       struct xen_platform_parameters pp;
16994 +       static struct callback_register __initdata event = {
16995 +               .type = CALLBACKTYPE_event,
16996 +               .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
16997 +       };
16998 +       static struct callback_register __initdata failsafe = {
16999 +               .type = CALLBACKTYPE_failsafe,
17000 +               .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17001 +       };
17002 +       static struct callback_register __initdata nmi_cb = {
17003 +               .type = CALLBACKTYPE_nmi,
17004 +               .address = { __KERNEL_CS, (unsigned long)nmi },
17005 +       };
17006 +
17007 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17008 +       if (ret == 0)
17009 +               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17010 +#ifdef CONFIG_XEN_COMPAT_030002
17011 +       if (ret == -ENOSYS)
17012 +               ret = HYPERVISOR_set_callbacks(
17013 +                       event.address.cs, event.address.eip,
17014 +                       failsafe.address.cs, failsafe.address.eip);
17015 +#endif
17016 +       BUG_ON(ret);
17017 +
17018 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17019 +#ifdef CONFIG_XEN_COMPAT_030002
17020 +       if (ret == -ENOSYS) {
17021 +               static struct xennmi_callback __initdata cb = {
17022 +                       .handler_address = (unsigned long)nmi
17023 +               };
17024 +
17025 +               cb.handler_address = nmi_cb.address.eip;
17026 +               HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17027 +       }
17028 +#endif
17029 +
17030 +       if (HYPERVISOR_xen_version(XENVER_platform_parameters,
17031 +                                  &pp) == 0) {
17032 +               hypervisor_virt_start = pp.virt_start;
17033 +               set_fixaddr_top();
17034 +       }
17035 +
17036 +       machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
17037 +       machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
17038 +       if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
17039 +               machine_to_phys_mapping = (unsigned long *)mapping.v_start;
17040 +               machine_to_phys_nr_ents = mapping.max_mfn + 1;
17041 +       }
17042 +       while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
17043 +               machine_to_phys_order++;
17044 +}
17045 +
17046 +/**
17047 + * pre_setup_arch_hook - hook called prior to any setup_arch() execution
17048 + *
17049 + * Description:
17050 + *     generally used to activate any machine specific identification
17051 + *     routines that may be needed before setup_arch() runs.  On VISWS
17052 + *     this is used to get the board revision and type.
17053 + **/
17054 +void __init pre_setup_arch_hook(void)
17055 +{
17056 +       int max_cmdline;
17057 +
17058 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
17059 +               max_cmdline = COMMAND_LINE_SIZE;
17060 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
17061 +       /* Save unparsed command line copy for /proc/cmdline */
17062 +       saved_command_line[max_cmdline-1] = '\0';
17063 +}
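The machine_to_phys_order loop in machine_specific_arch_setup() above simply
computes ceil(log2(nr_ents)). A user-space sketch of the same calculation, with
a hypothetical helper name:

#include <assert.h>

static unsigned int order_for(unsigned long nr_ents)
{
	unsigned int order = 0;

	while ((1UL << order) < nr_ents)
		order++;
	return order;
}

int main(void)
{
	assert(order_for(1) == 0);
	assert(order_for(1UL << 20) == 20);        /* exact power of two  */
	assert(order_for((1UL << 20) + 1) == 21);  /* rounds up otherwise */
	return 0;
}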
17064 diff -ruNp linux-2.6.19/arch/i386/mm/Makefile linux-2.6.19-xen-3.0.4/arch/i386/mm/Makefile
17065 --- linux-2.6.19/arch/i386/mm/Makefile  2006-11-29 21:57:37.000000000 +0000
17066 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/Makefile        2007-02-02 19:10:21.000000000 +0000
17067 @@ -8,3 +8,11 @@ obj-$(CONFIG_NUMA) += discontig.o
17068  obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
17069  obj-$(CONFIG_HIGHMEM) += highmem.o
17070  obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
17071 +
17072 +ifdef CONFIG_XEN
17073 +include $(srctree)/scripts/Makefile.xen
17074 +
17075 +obj-y          += hypervisor.o
17076 +
17077 +obj-y := $(call cherrypickxen, $(obj-y))
17078 +endif
17079 diff -ruNp linux-2.6.19/arch/i386/mm/fault-xen.c linux-2.6.19-xen-3.0.4/arch/i386/mm/fault-xen.c
17080 --- linux-2.6.19/arch/i386/mm/fault-xen.c       1970-01-01 00:00:00.000000000 +0000
17081 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/fault-xen.c     2007-02-02 19:10:21.000000000 +0000
17082 @@ -0,0 +1,756 @@
17083 +/*
17084 + *  linux/arch/i386/mm/fault.c
17085 + *
17086 + *  Copyright (C) 1995  Linus Torvalds
17087 + */
17088 +
17089 +#include <linux/signal.h>
17090 +#include <linux/sched.h>
17091 +#include <linux/kernel.h>
17092 +#include <linux/errno.h>
17093 +#include <linux/string.h>
17094 +#include <linux/types.h>
17095 +#include <linux/ptrace.h>
17096 +#include <linux/mman.h>
17097 +#include <linux/mm.h>
17098 +#include <linux/smp.h>
17099 +#include <linux/smp_lock.h>
17100 +#include <linux/interrupt.h>
17101 +#include <linux/init.h>
17102 +#include <linux/tty.h>
17103 +#include <linux/vt_kern.h>             /* For unblank_screen() */
17104 +#include <linux/highmem.h>
17105 +#include <linux/module.h>
17106 +#include <linux/kprobes.h>
17107 +
17108 +#include <asm/system.h>
17109 +#include <asm/uaccess.h>
17110 +#include <asm/desc.h>
17111 +#include <asm/kdebug.h>
17112 +#include <asm/segment.h>
17113 +
17114 +extern void die(const char *,struct pt_regs *,long);
17115 +
17116 +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
17117 +
17118 +int register_page_fault_notifier(struct notifier_block *nb)
17119 +{
17120 +       vmalloc_sync_all();
17121 +       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
17122 +}
17123 +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
17124 +
17125 +int unregister_page_fault_notifier(struct notifier_block *nb)
17126 +{
17127 +       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
17128 +}
17129 +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
17130 +
17131 +static inline int notify_page_fault(enum die_val val, const char *str,
17132 +                       struct pt_regs *regs, long err, int trap, int sig)
17133 +{
17134 +       struct die_args args = {
17135 +               .regs = regs,
17136 +               .str = str,
17137 +               .err = err,
17138 +               .trapnr = trap,
17139 +               .signr = sig
17140 +       };
17141 +       return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
17142 +}
17143 +
17144 +/*
17145 + * Unlock any spinlocks which will prevent us from getting the
17146 + * message out 
17147 + */
17148 +void bust_spinlocks(int yes)
17149 +{
17150 +       int loglevel_save = console_loglevel;
17151 +
17152 +       if (yes) {
17153 +               oops_in_progress = 1;
17154 +               return;
17155 +       }
17156 +#ifdef CONFIG_VT
17157 +       unblank_screen();
17158 +#endif
17159 +       oops_in_progress = 0;
17160 +       /*
17161 +        * OK, the message is on the console.  Now we call printk()
17162 +        * without oops_in_progress set so that printk will give klogd
17163 +        * a poke.  Hold onto your hats...
17164 +        */
17165 +       console_loglevel = 15;          /* NMI oopser may have shut the console up */
17166 +       printk(" ");
17167 +       console_loglevel = loglevel_save;
17168 +}
17169 +
17170 +/*
17171 + * Return EIP plus the CS segment base.  The segment limit is also
17172 + * adjusted, clamped to the kernel/user address space (whichever is
17173 + * appropriate), and returned in *eip_limit.
17174 + *
17175 + * The segment is checked, because it might have been changed by another
17176 + * task between the original faulting instruction and here.
17177 + *
17178 + * If CS is no longer a valid code segment, or if EIP is beyond the
17179 + * limit, or if it is a kernel address when CS is not a kernel segment,
17180 + * then the returned value will be greater than *eip_limit.
17181 + * 
17182 + * This is slow, but is very rarely executed.
17183 + */
17184 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
17185 +                                           unsigned long *eip_limit)
17186 +{
17187 +       unsigned long eip = regs->eip;
17188 +       unsigned seg = regs->xcs & 0xffff;
17189 +       u32 seg_ar, seg_limit, base, *desc;
17190 +
17191 +       /* Unlikely, but must come before segment checks. */
17192 +       if (unlikely(regs->eflags & VM_MASK)) {
17193 +               base = seg << 4;
17194 +               *eip_limit = base + 0xffff;
17195 +               return base + (eip & 0xffff);
17196 +       }
17197 +
17198 +       /* The standard kernel/user address space limit. */
17199 +       *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
17200 +       
17201 +       /* By far the most common cases. */
17202 +       if (likely(seg == SEGMENT_IS_FLAT_CODE(seg)))
17203 +               return eip;
17204 +
17205 +       /* Check the segment exists, is within the current LDT/GDT size,
17206 +          that kernel/user (ring 0..3) has the appropriate privilege,
17207 +          that it's a code segment, and get the limit. */
17208 +       __asm__ ("larl %3,%0; lsll %3,%1"
17209 +                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
17210 +       if ((~seg_ar & 0x9800) || eip > seg_limit) {
17211 +               *eip_limit = 0;
17212 +               return 1;        /* So that returned eip > *eip_limit. */
17213 +       }
17214 +
17215 +       /* Get the GDT/LDT descriptor base. 
17216 +          When you look for races in this code remember that
17217 +          LDT and other horrors are only used in user space. */
17218 +       if (seg & (1<<2)) {
17219 +               /* Must lock the LDT while reading it. */
17220 +               down(&current->mm->context.sem);
17221 +               desc = current->mm->context.ldt;
17222 +               desc = (void *)desc + (seg & ~7);
17223 +       } else {
17224 +               /* Must disable preemption while reading the GDT. */
17225 +               desc = (u32 *)get_cpu_gdt_table(get_cpu());
17226 +               desc = (void *)desc + (seg & ~7);
17227 +       }
17228 +
17229 +       /* Decode the code segment base from the descriptor */
17230 +       base = get_desc_base((unsigned long *)desc);
17231 +
17232 +       if (seg & (1<<2)) { 
17233 +               up(&current->mm->context.sem);
17234 +       } else
17235 +               put_cpu();
17236 +
17237 +       /* Adjust EIP and segment limit, and clamp at the kernel limit.
17238 +          It's legitimate for segments to wrap at 0xffffffff. */
17239 +       seg_limit += base;
17240 +       if (seg_limit < *eip_limit && seg_limit >= base)
17241 +               *eip_limit = seg_limit;
17242 +       return eip + base;
17243 +}
17244 +
17245 +/* 
17246 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
17247 + * Check that here and ignore it.
17248 + */
17249 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
17250 +{ 
17251 +       unsigned long limit;
17252 +       unsigned long instr = get_segment_eip (regs, &limit);
17253 +       int scan_more = 1;
17254 +       int prefetch = 0; 
17255 +       int i;
17256 +
17257 +       for (i = 0; scan_more && i < 15; i++) { 
17258 +               unsigned char opcode;
17259 +               unsigned char instr_hi;
17260 +               unsigned char instr_lo;
17261 +
17262 +               if (instr > limit)
17263 +                       break;
17264 +               if (__get_user(opcode, (unsigned char __user *) instr))
17265 +                       break; 
17266 +
17267 +               instr_hi = opcode & 0xf0; 
17268 +               instr_lo = opcode & 0x0f; 
17269 +               instr++;
17270 +
17271 +               switch (instr_hi) { 
17272 +               case 0x20:
17273 +               case 0x30:
17274 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
17275 +                       scan_more = ((instr_lo & 7) == 0x6);
17276 +                       break;
17277 +                       
17278 +               case 0x60:
17279 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
17280 +                       scan_more = (instr_lo & 0xC) == 0x4;
17281 +                       break;          
17282 +               case 0xF0:
17283 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
17284 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
17285 +                       break;                  
17286 +               case 0x00:
17287 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
17288 +                       scan_more = 0;
17289 +                       if (instr > limit)
17290 +                               break;
17291 +                       if (__get_user(opcode, (unsigned char __user *) instr))
17292 +                               break;
17293 +                       prefetch = (instr_lo == 0xF) &&
17294 +                               (opcode == 0x0D || opcode == 0x18);
17295 +                       break;                  
17296 +               default:
17297 +                       scan_more = 0;
17298 +                       break;
17299 +               } 
17300 +       }
17301 +       return prefetch;
17302 +}
17303 +
17304 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
17305 +                             unsigned long error_code)
17306 +{
17307 +       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
17308 +                    boot_cpu_data.x86 >= 6)) {
17309 +               /* Catch an obscure case of prefetch inside an NX page. */
17310 +               if (nx_enabled && (error_code & 16))
17311 +                       return 0;
17312 +               return __is_prefetch(regs, addr);
17313 +       }
17314 +       return 0;
17315 +} 
17316 +
17317 +static noinline void force_sig_info_fault(int si_signo, int si_code,
17318 +       unsigned long address, struct task_struct *tsk)
17319 +{
17320 +       siginfo_t info;
17321 +
17322 +       info.si_signo = si_signo;
17323 +       info.si_errno = 0;
17324 +       info.si_code = si_code;
17325 +       info.si_addr = (void __user *)address;
17326 +       force_sig_info(si_signo, &info, tsk);
17327 +}
17328 +
17329 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
17330 +
17331 +#ifdef CONFIG_X86_PAE
17332 +static void dump_fault_path(unsigned long address)
17333 +{
17334 +       unsigned long *p, page;
17335 +       unsigned long mfn; 
17336 +
17337 +       page = read_cr3();
17338 +       p  = (unsigned long *)__va(page);
17339 +       p += (address >> 30) * 2;
17340 +       printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
17341 +       if (p[0] & 1) {
17342 +               mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
17343 +               page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
17344 +               p  = (unsigned long *)__va(page);
17345 +               address &= 0x3fffffff;
17346 +               p += (address >> 21) * 2;
17347 +               printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", 
17348 +                      page, p[1], p[0]);
17349 +#ifndef CONFIG_HIGHPTE
17350 +               if (p[0] & 1) {
17351 +                       mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
17352 +                       page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
17353 +                       p  = (unsigned long *) __va(page);
17354 +                       address &= 0x001fffff;
17355 +                       p += (address >> 12) * 2;
17356 +                       printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
17357 +                              page, p[1], p[0]);
17358 +               }
17359 +#endif
17360 +       }
17361 +}
17362 +#else
17363 +static void dump_fault_path(unsigned long address)
17364 +{
17365 +       unsigned long page;
17366 +
17367 +       page = read_cr3();
17368 +       page = ((unsigned long *) __va(page))[address >> 22];
17369 +       if (oops_may_print())
17370 +               printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
17371 +                      machine_to_phys(page));
17372 +       /*
17373 +        * We must not directly access the pte in the highpte
17374 +        * case, the page table might be allocated in highmem.
17375 +        * And lets rather not kmap-atomic the pte, just in case
17376 +        * it's allocated already.
17377 +        */
17378 +#ifndef CONFIG_HIGHPTE
17379 +       if ((page & 1) && oops_may_print()) {
17380 +               page &= PAGE_MASK;
17381 +               address &= 0x003ff000;
17382 +               page = machine_to_phys(page);
17383 +               page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
17384 +               printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
17385 +                      machine_to_phys(page));
17386 +       }
17387 +#endif
17388 +}
17389 +#endif
17390 +
17391 +static int spurious_fault(struct pt_regs *regs,
17392 +                         unsigned long address,
17393 +                         unsigned long error_code)
17394 +{
17395 +       pgd_t *pgd;
17396 +       pud_t *pud;
17397 +       pmd_t *pmd;
17398 +       pte_t *pte;
17399 +
17400 +       /* Reserved-bit violation or user access to kernel space? */
17401 +       if (error_code & 0x0c)
17402 +               return 0;
17403 +
17404 +       pgd = init_mm.pgd + pgd_index(address);
17405 +       if (!pgd_present(*pgd))
17406 +               return 0;
17407 +
17408 +       pud = pud_offset(pgd, address);
17409 +       if (!pud_present(*pud))
17410 +               return 0;
17411 +
17412 +       pmd = pmd_offset(pud, address);
17413 +       if (!pmd_present(*pmd))
17414 +               return 0;
17415 +
17416 +       pte = pte_offset_kernel(pmd, address);
17417 +       if (!pte_present(*pte))
17418 +               return 0;
17419 +       if ((error_code & 0x02) && !pte_write(*pte))
17420 +               return 0;
17421 +#ifdef CONFIG_X86_PAE
17422 +       if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
17423 +               return 0;
17424 +#endif
17425 +
17426 +       return 1;
17427 +}
17428 +
17429 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
17430 +{
17431 +       unsigned index = pgd_index(address);
17432 +       pgd_t *pgd_k;
17433 +       pud_t *pud, *pud_k;
17434 +       pmd_t *pmd, *pmd_k;
17435 +
17436 +       pgd += index;
17437 +       pgd_k = init_mm.pgd + index;
17438 +
17439 +       if (!pgd_present(*pgd_k))
17440 +               return NULL;
17441 +
17442 +       /*
17443 +        * set_pgd(pgd, *pgd_k); here would be useless on PAE
17444 +        * and redundant with the set_pmd() on non-PAE. As would
17445 +        * set_pud.
17446 +        */
17447 +
17448 +       pud = pud_offset(pgd, address);
17449 +       pud_k = pud_offset(pgd_k, address);
17450 +       if (!pud_present(*pud_k))
17451 +               return NULL;
17452 +
17453 +       pmd = pmd_offset(pud, address);
17454 +       pmd_k = pmd_offset(pud_k, address);
17455 +       if (!pmd_present(*pmd_k))
17456 +               return NULL;
17457 +       if (!pmd_present(*pmd))
17458 +#ifndef CONFIG_XEN
17459 +               set_pmd(pmd, *pmd_k);
17460 +#else
17461 +               /*
17462 +                * When running on Xen we must launder *pmd_k through
17463 +                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
17464 +                */
17465 +               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
17466 +#endif
17467 +       else
17468 +               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
17469 +       return pmd_k;
17470 +}
17471 +
17472 +/*
17473 + * Handle a fault on the vmalloc or module mapping area
17474 + *
17475 + * This assumes no large pages in there.
17476 + */
17477 +static inline int vmalloc_fault(unsigned long address)
17478 +{
17479 +       unsigned long pgd_paddr;
17480 +       pmd_t *pmd_k;
17481 +       pte_t *pte_k;
17482 +       /*
17483 +        * Synchronize this task's top level page-table
17484 +        * with the 'reference' page table.
17485 +        *
17486 +        * Do _not_ use "current" here. We might be inside
17487 +        * an interrupt in the middle of a task switch..
17488 +        */
17489 +       pgd_paddr = read_cr3();
17490 +       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
17491 +       if (!pmd_k)
17492 +               return -1;
17493 +       pte_k = pte_offset_kernel(pmd_k, address);
17494 +       if (!pte_present(*pte_k))
17495 +               return -1;
17496 +       return 0;
17497 +}
17498 +
17499 +/*
17500 + * This routine handles page faults.  It determines the address,
17501 + * and the problem, and then passes it off to one of the appropriate
17502 + * routines.
17503 + *
17504 + * error_code:
17505 + *     bit 0 == 0 means no page found, 1 means protection fault
17506 + *     bit 1 == 0 means read, 1 means write
17507 + *     bit 2 == 0 means kernel, 1 means user-mode
17508 + *     bit 3 == 1 means use of reserved bit detected
17509 + *     bit 4 == 1 means fault was an instruction fetch
17510 + */
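/*
 * Editor's sketch (not from the patch): the error_code bits listed above,
 * written as named predicates.  do_page_fault() below open-codes these
 * tests; the helper names are hypothetical.
 */
static inline int pf_protection_fault(unsigned long ec) { return ec & 1;  }
static inline int pf_write_access(unsigned long ec)     { return ec & 2;  }
static inline int pf_user_mode(unsigned long ec)        { return ec & 4;  }
static inline int pf_reserved_bit(unsigned long ec)     { return ec & 8;  }
static inline int pf_instr_fetch(unsigned long ec)      { return ec & 16; }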
17511 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
17512 +                                     unsigned long error_code)
17513 +{
17514 +       struct task_struct *tsk;
17515 +       struct mm_struct *mm;
17516 +       struct vm_area_struct * vma;
17517 +       unsigned long address;
17518 +       int write, si_code;
17519 +
17520 +       /* get the address */
17521 +        address = read_cr2();
17522 +
17523 +       /* Set the "privileged fault" bit to something sane. */
17524 +       error_code &= ~4;
17525 +       error_code |= (regs->xcs & 2) << 1;
17526 +       if (regs->eflags & X86_EFLAGS_VM)
17527 +               error_code |= 4;
17528 +
17529 +       tsk = current;
17530 +
17531 +       si_code = SEGV_MAPERR;
17532 +
17533 +       /*
17534 +        * We fault-in kernel-space virtual memory on-demand. The
17535 +        * 'reference' page table is init_mm.pgd.
17536 +        *
17537 +        * NOTE! We MUST NOT take any locks for this case. We may
17538 +        * be in an interrupt or a critical region, and should
17539 +        * only copy the information from the master page table,
17540 +        * nothing more.
17541 +        *
17542 +        * This verifies that the fault happens in kernel space
17543 +        * (error_code & 4) == 0, and that the fault was not a
17544 +        * protection error (error_code & 9) == 0.
17545 +        */
17546 +       if (unlikely(address >= TASK_SIZE)) {
17547 +#ifdef CONFIG_XEN
17548 +               /* Faults in hypervisor area can never be patched up. */
17549 +               if (address >= hypervisor_virt_start)
17550 +                       goto bad_area_nosemaphore;
17551 +#endif
17552 +               if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
17553 +                       return;
17554 +               /* Can take a spurious fault if mapping changes R/O -> R/W. */
17555 +               if (spurious_fault(regs, address, error_code))
17556 +                       return;
17557 +               if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
17558 +                                               SIGSEGV) == NOTIFY_STOP)
17559 +                       return;
17560 +               /*
17561 +                * Don't take the mm semaphore here. If we fixup a prefetch
17562 +                * fault we could otherwise deadlock.
17563 +                */
17564 +               goto bad_area_nosemaphore;
17565 +       }
17566 +
17567 +       if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
17568 +                                       SIGSEGV) == NOTIFY_STOP)
17569 +               return;
17570 +
17571 +       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
17572 +          fault has been handled. */
17573 +       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
17574 +               local_irq_enable();
17575 +
17576 +       mm = tsk->mm;
17577 +
17578 +       /*
17579 +        * If we're in an interrupt, have no user context or are running in an
17580 +        * atomic region then we must not take the fault..
17581 +        */
17582 +       if (in_atomic() || !mm)
17583 +               goto bad_area_nosemaphore;
17584 +
17585 +       /* When running in the kernel we expect faults to occur only to
17586 +        * addresses in user space.  All other faults represent errors in the
17587 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
17588 +        * erroneous fault occurring in a code path which already holds mmap_sem
17589 +        * we will deadlock attempting to validate the fault against the
17590 +        * address space.  Luckily the kernel only validly references user
17591 +        * space from well defined areas of code, which are listed in the
17592 +        * exceptions table.
17593 +        *
17594 +        * As the vast majority of faults will be valid we will only perform
17595 +        * the source reference check when there is a possibility of a deadlock.
17596 +        * Attempt to lock the address space, if we cannot we then validate the
17597 +        * source.  If this is invalid we can skip the address space check,
17598 +        * thus avoiding the deadlock.
17599 +        */
17600 +       if (!down_read_trylock(&mm->mmap_sem)) {
17601 +               if ((error_code & 4) == 0 &&
17602 +                   !search_exception_tables(regs->eip))
17603 +                       goto bad_area_nosemaphore;
17604 +               down_read(&mm->mmap_sem);
17605 +       }
17606 +
17607 +       vma = find_vma(mm, address);
17608 +       if (!vma)
17609 +               goto bad_area;
17610 +       if (vma->vm_start <= address)
17611 +               goto good_area;
17612 +       if (!(vma->vm_flags & VM_GROWSDOWN))
17613 +               goto bad_area;
17614 +       if (error_code & 4) {
17615 +               /*
17616 +                * Accessing the stack below %esp is always a bug.
17617 +                * The large cushion allows instructions like enter
17618 +                * and pusha to work.  ("enter $65535,$31" pushes
17619 +                * 32 pointers and then decrements %esp by 65535.)
17620 +                */
17621 +               if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
17622 +                       goto bad_area;
17623 +       }
17624 +       if (expand_stack(vma, address))
17625 +               goto bad_area;
17626 +/*
17627 + * Ok, we have a good vm_area for this memory access, so
17628 + * we can handle it..
17629 + */
17630 +good_area:
17631 +       si_code = SEGV_ACCERR;
17632 +       write = 0;
17633 +       switch (error_code & 3) {
17634 +               default:        /* 3: write, present */
17635 +                               /* fall through */
17636 +               case 2:         /* write, not present */
17637 +                       if (!(vma->vm_flags & VM_WRITE))
17638 +                               goto bad_area;
17639 +                       write++;
17640 +                       break;
17641 +               case 1:         /* read, present */
17642 +                       goto bad_area;
17643 +               case 0:         /* read, not present */
17644 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
17645 +                               goto bad_area;
17646 +       }
17647 +
17648 + survive:
17649 +       /*
17650 +        * If for any reason at all we couldn't handle the fault,
17651 +        * make sure we exit gracefully rather than endlessly redo
17652 +        * the fault.
17653 +        */
17654 +       switch (handle_mm_fault(mm, vma, address, write)) {
17655 +               case VM_FAULT_MINOR:
17656 +                       tsk->min_flt++;
17657 +                       break;
17658 +               case VM_FAULT_MAJOR:
17659 +                       tsk->maj_flt++;
17660 +                       break;
17661 +               case VM_FAULT_SIGBUS:
17662 +                       goto do_sigbus;
17663 +               case VM_FAULT_OOM:
17664 +                       goto out_of_memory;
17665 +               default:
17666 +                       BUG();
17667 +       }
17668 +
17669 +       /*
17670 +        * Did it hit the DOS screen memory VA from vm86 mode?
17671 +        */
17672 +       if (regs->eflags & VM_MASK) {
17673 +               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
17674 +               if (bit < 32)
17675 +                       tsk->thread.screen_bitmap |= 1 << bit;
17676 +       }
17677 +       up_read(&mm->mmap_sem);
17678 +       return;
17679 +
17680 +/*
17681 + * Something tried to access memory that isn't in our memory map..
17682 + * Fix it, but check if it's kernel or user first..
17683 + */
17684 +bad_area:
17685 +       up_read(&mm->mmap_sem);
17686 +
17687 +bad_area_nosemaphore:
17688 +       /* User mode accesses just cause a SIGSEGV */
17689 +       if (error_code & 4) {
17690 +               /* 
17691 +                * Valid to do another page fault here because this one came 
17692 +                * from user space.
17693 +                */
17694 +               if (is_prefetch(regs, address, error_code))
17695 +                       return;
17696 +
17697 +               tsk->thread.cr2 = address;
17698 +               /* Kernel addresses are always protection faults */
17699 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
17700 +               tsk->thread.trap_no = 14;
17701 +               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
17702 +               return;
17703 +       }
17704 +
17705 +#ifdef CONFIG_X86_F00F_BUG
17706 +       /*
17707 +        * Pentium F0 0F C7 C8 bug workaround.
17708 +        */
17709 +       if (boot_cpu_data.f00f_bug) {
17710 +               unsigned long nr;
17711 +               
17712 +               nr = (address - idt_descr.address) >> 3;
17713 +
17714 +               if (nr == 6) {
17715 +                       do_invalid_op(regs, 0);
17716 +                       return;
17717 +               }
17718 +       }
17719 +#endif
17720 +
17721 +no_context:
17722 +       /* Are we prepared to handle this kernel fault?  */
17723 +       if (fixup_exception(regs))
17724 +               return;
17725 +
17726 +       /* 
17727 +        * Valid to do another page fault here, because if this fault
17728 +        * had been triggered by is_prefetch fixup_exception would have 
17729 +        * handled it.
17730 +        */
17731 +       if (is_prefetch(regs, address, error_code))
17732 +               return;
17733 +
17734 +/*
17735 + * Oops. The kernel tried to access some bad page. We'll have to
17736 + * terminate things with extreme prejudice.
17737 + */
17738 +
17739 +       bust_spinlocks(1);
17740 +
17741 +       if (oops_may_print()) {
17742 +       #ifdef CONFIG_X86_PAE
17743 +               if (error_code & 16) {
17744 +                       pte_t *pte = lookup_address(address);
17745 +
17746 +                       if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
17747 +                               printk(KERN_CRIT "kernel tried to execute "
17748 +                                       "NX-protected page - exploit attempt? "
17749 +                                       "(uid: %d)\n", current->uid);
17750 +               }
17751 +       #endif
17752 +               if (address < PAGE_SIZE)
17753 +                       printk(KERN_ALERT "BUG: unable to handle kernel NULL "
17754 +                                       "pointer dereference");
17755 +               else
17756 +                       printk(KERN_ALERT "BUG: unable to handle kernel paging"
17757 +                                       " request");
17758 +               printk(" at virtual address %08lx\n",address);
17759 +               printk(KERN_ALERT " printing eip:\n");
17760 +               printk("%08lx\n", regs->eip);
17761 +               dump_fault_path(address);
17762 +       }
17763 +       tsk->thread.cr2 = address;
17764 +       tsk->thread.trap_no = 14;
17765 +       tsk->thread.error_code = error_code;
17766 +       die("Oops", regs, error_code);
17767 +       bust_spinlocks(0);
17768 +       do_exit(SIGKILL);
17769 +
17770 +/*
17771 + * We ran out of memory, or some other thing happened to us that made
17772 + * us unable to handle the page fault gracefully.
17773 + */
17774 +out_of_memory:
17775 +       up_read(&mm->mmap_sem);
17776 +       if (is_init(tsk)) {
17777 +               yield();
17778 +               down_read(&mm->mmap_sem);
17779 +               goto survive;
17780 +       }
17781 +       printk("VM: killing process %s\n", tsk->comm);
17782 +       if (error_code & 4)
17783 +               do_exit(SIGKILL);
17784 +       goto no_context;
17785 +
17786 +do_sigbus:
17787 +       up_read(&mm->mmap_sem);
17788 +
17789 +       /* Kernel mode? Handle exceptions or die */
17790 +       if (!(error_code & 4))
17791 +               goto no_context;
17792 +
17793 +       /* User space => ok to do another page fault */
17794 +       if (is_prefetch(regs, address, error_code))
17795 +               return;
17796 +
17797 +       tsk->thread.cr2 = address;
17798 +       tsk->thread.error_code = error_code;
17799 +       tsk->thread.trap_no = 14;
17800 +       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
17801 +}
17802 +
17803 +#ifndef CONFIG_X86_PAE
17804 +void vmalloc_sync_all(void)
17805 +{
17806 +       /*
17807 +        * Note that races in the updates of insync and start aren't
17808 +        * problematic: insync can only get set bits added, and updates to
17809 +        * start are only improving performance (without affecting correctness
17810 +        * if undone).
17811 +        */
17812 +       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
17813 +       static unsigned long start = TASK_SIZE;
17814 +       unsigned long address;
17815 +
17816 +       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
17817 +       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
17818 +               if (!test_bit(pgd_index(address), insync)) {
17819 +                       unsigned long flags;
17820 +                       struct page *page;
17821 +
17822 +                       spin_lock_irqsave(&pgd_lock, flags);
17823 +                       for (page = pgd_list; page; page =
17824 +                                       (struct page *)page->index)
17825 +                               if (!vmalloc_sync_one(page_address(page),
17826 +                                                               address)) {
17827 +                                       BUG_ON(page != pgd_list);
17828 +                                       break;
17829 +                               }
17830 +                       spin_unlock_irqrestore(&pgd_lock, flags);
17831 +                       if (!page)
17832 +                               set_bit(pgd_index(address), insync);
17833 +               }
17834 +               if (address == start && test_bit(pgd_index(address), insync))
17835 +                       start = address + PGDIR_SIZE;
17836 +       }
17837 +}
17838 +#endif
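
For reference, the fault handler above tests raw bits of the hardware page-fault error code (error_code & 4 for a user-mode access, error_code & 3 for the present/write combination, error_code & 16 for an instruction fetch under PAE).  A minimal sketch of that decoding follows; the macro names are illustrative only and are not part of the patch, which tests the bits directly:

        /* Sketch only: x86 page-fault error code bits as used above. */
        #define PF_PROT   (1 << 0)  /* 0: not-present fault, 1: protection fault */
        #define PF_WRITE  (1 << 1)  /* 0: read access,       1: write access */
        #define PF_USER   (1 << 2)  /* 0: kernel mode,       1: user mode */
        #define PF_RSVD   (1 << 3)  /* reserved bit set in a paging entry */
        #define PF_INSTR  (1 << 4)  /* instruction fetch (the PAE/NX check above) */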
17839 diff -ruNp linux-2.6.19/arch/i386/mm/highmem-xen.c linux-2.6.19-xen-3.0.4/arch/i386/mm/highmem-xen.c
17840 --- linux-2.6.19/arch/i386/mm/highmem-xen.c     1970-01-01 00:00:00.000000000 +0000
17841 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/highmem-xen.c   2007-02-02 19:10:21.000000000 +0000
17842 @@ -0,0 +1,121 @@
17843 +#include <linux/highmem.h>
17844 +#include <linux/module.h>
17845 +
17846 +void *kmap(struct page *page)
17847 +{
17848 +       might_sleep();
17849 +       if (!PageHighMem(page))
17850 +               return page_address(page);
17851 +       return kmap_high(page);
17852 +}
17853 +
17854 +void kunmap(struct page *page)
17855 +{
17856 +       if (in_interrupt())
17857 +               BUG();
17858 +       if (!PageHighMem(page))
17859 +               return;
17860 +       kunmap_high(page);
17861 +}
17862 +
17863 +/*
17864 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
17865 + * no global lock is needed and because the kmap code must perform a global TLB
17866 + * invalidation when the kmap pool wraps.
17867 + *
17868 + * However, when holding an atomic kmap it is not legal to sleep, so atomic
17869 + * kmaps are appropriate for short, tight code paths only.
17870 + */
17871 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
17872 +{
17873 +       enum fixed_addresses idx;
17874 +       unsigned long vaddr;
17875 +
17876 +       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
17877 +       inc_preempt_count();
17878 +       if (!PageHighMem(page))
17879 +               return page_address(page);
17880 +
17881 +       idx = type + KM_TYPE_NR*smp_processor_id();
17882 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
17883 +       if (!pte_none(*(kmap_pte-idx)))
17884 +               BUG();
17885 +       set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
17886 +
17887 +       return (void*) vaddr;
17888 +}
17889 +
17890 +void *kmap_atomic(struct page *page, enum km_type type)
17891 +{
17892 +       return __kmap_atomic(page, type, kmap_prot);
17893 +}
17894 +
17895 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
17896 +void *kmap_atomic_pte(struct page *page, enum km_type type)
17897 +{
17898 +       return __kmap_atomic(page, type, PAGE_KERNEL_RO);
17899 +}
17900 +
17901 +void kunmap_atomic(void *kvaddr, enum km_type type)
17902 +{
17903 +       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
17904 +       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
17905 +
17906 +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
17907 +       if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
17908 +               dec_preempt_count();
17909 +               preempt_check_resched();
17910 +               return;
17911 +       }
17912 +
17913 +       if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
17914 +               BUG();
17915 +#endif
17916 +       /*
17917 +        * Force other mappings to Oops if they try to access this pte
17918 +        * without first remapping it.  Keeping stale mappings around is also
17919 +        * a bad idea, in case the page changes cacheability attributes or becomes
17920 +        * a protected page in a hypervisor.
17921 +        */
17922 +       kpte_clear_flush(kmap_pte-idx, vaddr);
17923 +       __flush_tlb_one(vaddr);
17924 +
17925 +       dec_preempt_count();
17926 +       preempt_check_resched();
17927 +}
17928 +
17929 +/* This is the same as kmap_atomic() but can map memory that doesn't
17930 + * have a struct page associated with it.
17931 + */
17932 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
17933 +{
17934 +       enum fixed_addresses idx;
17935 +       unsigned long vaddr;
17936 +
17937 +       inc_preempt_count();
17938 +
17939 +       idx = type + KM_TYPE_NR*smp_processor_id();
17940 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
17941 +       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
17942 +
17943 +       return (void*) vaddr;
17944 +}
17945 +
17946 +struct page *kmap_atomic_to_page(void *ptr)
17947 +{
17948 +       unsigned long idx, vaddr = (unsigned long)ptr;
17949 +       pte_t *pte;
17950 +
17951 +       if (vaddr < FIXADDR_START)
17952 +               return virt_to_page(ptr);
17953 +
17954 +       idx = virt_to_fix(vaddr);
17955 +       pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
17956 +       return pte_page(*pte);
17957 +}
17958 +
17959 +EXPORT_SYMBOL(kmap);
17960 +EXPORT_SYMBOL(kunmap);
17961 +EXPORT_SYMBOL(kmap_atomic);
17962 +EXPORT_SYMBOL(kunmap_atomic);
17963 +EXPORT_SYMBOL(kmap_atomic_to_page);
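
For reference, a minimal usage sketch of the atomic kmap helpers defined above (not part of the patch; the helper name and the use of the KM_USER0 slot are illustrative).  Nothing between kmap_atomic() and kunmap_atomic() may sleep, since the mapping is per-CPU and preemption is disabled:

        /* Sketch only: copy bytes out of a possibly-highmem page. */
        static void copy_from_page(struct page *page, void *dst, size_t len)
        {
                char *src = kmap_atomic(page, KM_USER0); /* raises preempt count */
                memcpy(dst, src, len);                   /* must not sleep here  */
                kunmap_atomic(src, KM_USER0);            /* clears pte, flushes  */
        }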
17964 diff -ruNp linux-2.6.19/arch/i386/mm/hypervisor.c linux-2.6.19-xen-3.0.4/arch/i386/mm/hypervisor.c
17965 --- linux-2.6.19/arch/i386/mm/hypervisor.c      1970-01-01 00:00:00.000000000 +0000
17966 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/hypervisor.c    2007-02-02 19:10:21.000000000 +0000
17967 @@ -0,0 +1,449 @@
17968 +/******************************************************************************
17969 + * mm/hypervisor.c
17970 + * 
17971 + * Update page tables via the hypervisor.
17972 + * 
17973 + * Copyright (c) 2002-2004, K A Fraser
17974 + * 
17975 + * This program is free software; you can redistribute it and/or
17976 + * modify it under the terms of the GNU General Public License version 2
17977 + * as published by the Free Software Foundation; or, when distributed
17978 + * separately from the Linux kernel or incorporated into other
17979 + * software packages, subject to the following license:
17980 + * 
17981 + * Permission is hereby granted, free of charge, to any person obtaining a copy
17982 + * of this source file (the "Software"), to deal in the Software without
17983 + * restriction, including without limitation the rights to use, copy, modify,
17984 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17985 + * and to permit persons to whom the Software is furnished to do so, subject to
17986 + * the following conditions:
17987 + * 
17988 + * The above copyright notice and this permission notice shall be included in
17989 + * all copies or substantial portions of the Software.
17990 + * 
17991 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17992 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17993 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17994 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17995 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17996 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
17997 + * IN THE SOFTWARE.
17998 + */
17999 +
18000 +#include <linux/sched.h>
18001 +#include <linux/mm.h>
18002 +#include <linux/vmalloc.h>
18003 +#include <asm/page.h>
18004 +#include <asm/pgtable.h>
18005 +#include <asm/hypervisor.h>
18006 +#include <xen/balloon.h>
18007 +#include <xen/features.h>
18008 +#include <xen/interface/memory.h>
18009 +#include <linux/module.h>
18010 +#include <linux/percpu.h>
18011 +#include <asm/tlbflush.h>
18012 +
18013 +#ifdef CONFIG_X86_64
18014 +#define pmd_val_ma(v) (v).pmd
18015 +#else
18016 +#ifdef CONFIG_X86_PAE
18017 +# define pmd_val_ma(v) ((v).pmd)
18018 +# define pud_val_ma(v) ((v).pgd.pgd)
18019 +#else
18020 +# define pmd_val_ma(v) ((v).pud.pgd.pgd)
18021 +#endif
18022 +#endif
18023 +
18024 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
18025 +{
18026 +       mmu_update_t u;
18027 +       u.ptr = virt_to_machine(ptr);
18028 +       u.val = pte_val_ma(val);
18029 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
18030 +}
18031 +
18032 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
18033 +{
18034 +       mmu_update_t u;
18035 +       u.ptr = virt_to_machine(ptr);
18036 +       u.val = pmd_val_ma(val);
18037 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
18038 +}
18039 +
18040 +#ifdef CONFIG_X86_PAE
18041 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
18042 +{
18043 +       mmu_update_t u;
18044 +       u.ptr = virt_to_machine(ptr);
18045 +       u.val = pud_val_ma(val);
18046 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
18047 +}
18048 +#endif
18049 +
18050 +#ifdef CONFIG_X86_64
18051 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
18052 +{
18053 +       mmu_update_t u;
18054 +       u.ptr = virt_to_machine(ptr);
18055 +       u.val = val.pud;
18056 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
18057 +}
18058 +
18059 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
18060 +{
18061 +       mmu_update_t u;
18062 +       u.ptr = virt_to_machine(ptr);
18063 +       u.val = val.pgd;
18064 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
18065 +}
18066 +#endif /* CONFIG_X86_64 */
18067 +
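The single-entry updates above all follow the same pattern: fill an mmu_update_t with the machine address of the page-table slot and the new value, then issue one HYPERVISOR_mmu_update hypercall.  A sketch (not part of the patch) showing that the same hypercall can batch several updates at once:

        /* Sketch only: batch two PTE updates in one hypercall. */
        static void xen_set_two_ptes(pte_t *ptep0, pte_t val0,
                                     pte_t *ptep1, pte_t val1)
        {
                mmu_update_t u[2];

                u[0].ptr = virt_to_machine(ptep0);
                u[0].val = pte_val_ma(val0);
                u[1].ptr = virt_to_machine(ptep1);
                u[1].val = pte_val_ma(val1);
                BUG_ON(HYPERVISOR_mmu_update(u, 2, NULL, DOMID_SELF) < 0);
        }
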
18068 +void xen_pt_switch(unsigned long ptr)
18069 +{
18070 +       struct mmuext_op op;
18071 +       op.cmd = MMUEXT_NEW_BASEPTR;
18072 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
18073 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18074 +}
18075 +
18076 +void xen_new_user_pt(unsigned long ptr)
18077 +{
18078 +       struct mmuext_op op;
18079 +       op.cmd = MMUEXT_NEW_USER_BASEPTR;
18080 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
18081 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18082 +}
18083 +
18084 +void xen_tlb_flush(void)
18085 +{
18086 +       struct mmuext_op op;
18087 +       op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
18088 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18089 +}
18090 +EXPORT_SYMBOL(xen_tlb_flush);
18091 +
18092 +void xen_invlpg(unsigned long ptr)
18093 +{
18094 +       struct mmuext_op op;
18095 +       op.cmd = MMUEXT_INVLPG_LOCAL;
18096 +       op.arg1.linear_addr = ptr & PAGE_MASK;
18097 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18098 +}
18099 +EXPORT_SYMBOL(xen_invlpg);
18100 +
18101 +#ifdef CONFIG_SMP
18102 +
18103 +void xen_tlb_flush_all(void)
18104 +{
18105 +       struct mmuext_op op;
18106 +       op.cmd = MMUEXT_TLB_FLUSH_ALL;
18107 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18108 +}
18109 +
18110 +void xen_tlb_flush_mask(cpumask_t *mask)
18111 +{
18112 +       struct mmuext_op op;
18113 +       if ( cpus_empty(*mask) )
18114 +               return;
18115 +       op.cmd = MMUEXT_TLB_FLUSH_MULTI;
18116 +       op.arg2.vcpumask = mask->bits;
18117 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18118 +}
18119 +
18120 +void xen_invlpg_all(unsigned long ptr)
18121 +{
18122 +       struct mmuext_op op;
18123 +       op.cmd = MMUEXT_INVLPG_ALL;
18124 +       op.arg1.linear_addr = ptr & PAGE_MASK;
18125 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18126 +}
18127 +
18128 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
18129 +{
18130 +       struct mmuext_op op;
18131 +       if ( cpus_empty(*mask) )
18132 +               return;
18133 +       op.cmd = MMUEXT_INVLPG_MULTI;
18134 +       op.arg1.linear_addr = ptr & PAGE_MASK;
18135 +       op.arg2.vcpumask    = mask->bits;
18136 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18137 +}
18138 +
18139 +#endif /* CONFIG_SMP */
18140 +
18141 +void xen_pgd_pin(unsigned long ptr)
18142 +{
18143 +       struct mmuext_op op;
18144 +#ifdef CONFIG_X86_64
18145 +       op.cmd = MMUEXT_PIN_L4_TABLE;
18146 +#elif defined(CONFIG_X86_PAE)
18147 +       op.cmd = MMUEXT_PIN_L3_TABLE;
18148 +#else
18149 +       op.cmd = MMUEXT_PIN_L2_TABLE;
18150 +#endif
18151 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
18152 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18153 +}
18154 +
18155 +void xen_pgd_unpin(unsigned long ptr)
18156 +{
18157 +       struct mmuext_op op;
18158 +       op.cmd = MMUEXT_UNPIN_TABLE;
18159 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
18160 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18161 +}
18162 +
18163 +void xen_set_ldt(unsigned long ptr, unsigned long len)
18164 +{
18165 +       struct mmuext_op op;
18166 +       op.cmd = MMUEXT_SET_LDT;
18167 +       op.arg1.linear_addr = ptr;
18168 +       op.arg2.nr_ents     = len;
18169 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
18170 +}
18171 +
18172 +/*
18173 + * Bitmap is indexed by page number. If bit is set, the page is part of a
18174 + * xen_create_contiguous_region() area of memory.
18175 + */
18176 +unsigned long *contiguous_bitmap;
18177 +
18178 +static void contiguous_bitmap_set(
18179 +       unsigned long first_page, unsigned long nr_pages)
18180 +{
18181 +       unsigned long start_off, end_off, curr_idx, end_idx;
18182 +
18183 +       curr_idx  = first_page / BITS_PER_LONG;
18184 +       start_off = first_page & (BITS_PER_LONG-1);
18185 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
18186 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
18187 +
18188 +       if (curr_idx == end_idx) {
18189 +               contiguous_bitmap[curr_idx] |=
18190 +                       ((1UL<<end_off)-1) & -(1UL<<start_off);
18191 +       } else {
18192 +               contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
18193 +               while ( ++curr_idx < end_idx )
18194 +                       contiguous_bitmap[curr_idx] = ~0UL;
18195 +               contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
18196 +       }
18197 +}
18198 +
18199 +static void contiguous_bitmap_clear(
18200 +       unsigned long first_page, unsigned long nr_pages)
18201 +{
18202 +       unsigned long start_off, end_off, curr_idx, end_idx;
18203 +
18204 +       curr_idx  = first_page / BITS_PER_LONG;
18205 +       start_off = first_page & (BITS_PER_LONG-1);
18206 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
18207 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
18208 +
18209 +       if (curr_idx == end_idx) {
18210 +               contiguous_bitmap[curr_idx] &=
18211 +                       -(1UL<<end_off) | ((1UL<<start_off)-1);
18212 +       } else {
18213 +               contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
18214 +               while ( ++curr_idx != end_idx )
18215 +                       contiguous_bitmap[curr_idx] = 0;
18216 +               contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
18217 +       }
18218 +}
18219 +
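A short worked example of the mask arithmetic above, assuming BITS_PER_LONG == 32 (illustrative, not part of the patch):

        /*
         * contiguous_bitmap_set(3, 5): curr_idx == end_idx == 0,
         * start_off == 3, end_off == 8, so word 0 is OR-ed with
         *   ((1UL << 8) - 1) & -(1UL << 3) == 0x000000ff & 0xfffffff8 == 0xf8,
         * i.e. bits 3..7 are set -- exactly the five pages 3..7.
         */
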
18220 +/* Protected by balloon_lock. */
18221 +#define MAX_CONTIG_ORDER 9 /* 2MB */
18222 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
18223 +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
18224 +
18225 +/* Ensure multi-page extents are contiguous in machine memory. */
18226 +int xen_create_contiguous_region(
18227 +       unsigned long vstart, unsigned int order, unsigned int address_bits)
18228 +{
18229 +       unsigned long *in_frames = discontig_frames, out_frame;
18230 +       unsigned long  frame, i, flags;
18231 +       long           rc;
18232 +       int            success;
18233 +       struct xen_memory_exchange exchange = {
18234 +               .in = {
18235 +                       .nr_extents   = 1UL << order,
18236 +                       .extent_order = 0,
18237 +                       .domid        = DOMID_SELF
18238 +               },
18239 +               .out = {
18240 +                       .nr_extents   = 1,
18241 +                       .extent_order = order,
18242 +                       .address_bits = address_bits,
18243 +                       .domid        = DOMID_SELF
18244 +               }
18245 +       };
18246 +
18247 +       /*
18248 +        * Currently an auto-translated guest will not perform I/O, nor will
18249 +        * it require PAE page directories below 4GB. Therefore any calls to
18250 +        * this function are redundant and can be ignored.
18251 +        */
18252 +       if (xen_feature(XENFEAT_auto_translated_physmap))
18253 +               return 0;
18254 +
18255 +       if (unlikely(order > MAX_CONTIG_ORDER))
18256 +               return -ENOMEM;
18257 +
18258 +       set_xen_guest_handle(exchange.in.extent_start, in_frames);
18259 +       set_xen_guest_handle(exchange.out.extent_start, &out_frame);
18260 +
18261 +       scrub_pages(vstart, 1 << order);
18262 +
18263 +       balloon_lock(flags);
18264 +
18265 +       /* 1. Zap current PTEs, remembering MFNs. */
18266 +       for (i = 0; i < (1UL<<order); i++) {
18267 +               in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
18268 +               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
18269 +                                       __pte_ma(0), 0);
18270 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
18271 +                       INVALID_P2M_ENTRY);
18272 +       }
18273 +       if (HYPERVISOR_multicall(cr_mcl, i))
18274 +               BUG();
18275 +
18276 +       /* 2. Get a new contiguous memory extent. */
18277 +       out_frame = __pa(vstart) >> PAGE_SHIFT;
18278 +       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
18279 +       success = (exchange.nr_exchanged == (1UL << order));
18280 +       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
18281 +       BUG_ON(success && (rc != 0));
18282 +#ifdef CONFIG_XEN_COMPAT_030002
18283 +       if (unlikely(rc == -ENOSYS)) {
18284 +               /* Compatibility when XENMEM_exchange is unsupported. */
18285 +               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
18286 +                                        &exchange.in) != (1UL << order))
18287 +                       BUG();
18288 +               success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18289 +                                               &exchange.out) == 1);
18290 +               if (!success) {
18291 +                       /* Couldn't get special memory: fall back to normal. */
18292 +                       for (i = 0; i < (1UL<<order); i++)
18293 +                               in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
18294 +                       if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18295 +                                                &exchange.in) != (1UL<<order))
18296 +                               BUG();
18297 +               }
18298 +       }
18299 +#endif
18300 +
18301 +       /* 3. Map the new extent in place of old pages. */
18302 +       for (i = 0; i < (1UL<<order); i++) {
18303 +               frame = success ? (out_frame + i) : in_frames[i];
18304 +               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
18305 +                                       pfn_pte_ma(frame, PAGE_KERNEL), 0);
18306 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
18307 +       }
18308 +
18309 +       cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
18310 +                                                  ? UVMF_TLB_FLUSH|UVMF_ALL
18311 +                                                  : UVMF_INVLPG|UVMF_ALL;
18312 +       if (HYPERVISOR_multicall(cr_mcl, i))
18313 +               BUG();
18314 +
18315 +       if (success)
18316 +               contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT,
18317 +                                     1UL << order);
18318 +
18319 +       balloon_unlock(flags);
18320 +
18321 +       return success ? 0 : -ENOMEM;
18322 +}
18323 +
18324 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
18325 +{
18326 +       unsigned long *out_frames = discontig_frames, in_frame;
18327 +       unsigned long  frame, i, flags;
18328 +       long           rc;
18329 +       int            success;
18330 +       struct xen_memory_exchange exchange = {
18331 +               .in = {
18332 +                       .nr_extents   = 1,
18333 +                       .extent_order = order,
18334 +                       .domid        = DOMID_SELF
18335 +               },
18336 +               .out = {
18337 +                       .nr_extents   = 1UL << order,
18338 +                       .extent_order = 0,
18339 +                       .domid        = DOMID_SELF
18340 +               }
18341 +       };
18342 +
18343 +       if (xen_feature(XENFEAT_auto_translated_physmap) ||
18344 +           !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap))
18345 +               return;
18346 +
18347 +       if (unlikely(order > MAX_CONTIG_ORDER))
18348 +               return;
18349 +
18350 +       set_xen_guest_handle(exchange.in.extent_start, &in_frame);
18351 +       set_xen_guest_handle(exchange.out.extent_start, out_frames);
18352 +
18353 +       scrub_pages(vstart, 1 << order);
18354 +
18355 +       balloon_lock(flags);
18356 +
18357 +       contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
18358 +
18359 +       /* 1. Find start MFN of contiguous extent. */
18360 +       in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
18361 +
18362 +       /* 2. Zap current PTEs. */
18363 +       for (i = 0; i < (1UL<<order); i++) {
18364 +               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
18365 +                                       __pte_ma(0), 0);
18366 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
18367 +                       INVALID_P2M_ENTRY);
18368 +               out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
18369 +       }
18370 +       if (HYPERVISOR_multicall(cr_mcl, i))
18371 +               BUG();
18372 +
18373 +       /* 3. Do the exchange for non-contiguous MFNs. */
18374 +       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
18375 +       success = (exchange.nr_exchanged == 1);
18376 +       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
18377 +       BUG_ON(success && (rc != 0));
18378 +#ifdef CONFIG_XEN_COMPAT_030002
18379 +       if (unlikely(rc == -ENOSYS)) {
18380 +               /* Compatibility when XENMEM_exchange is unsupported. */
18381 +               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
18382 +                                        &exchange.in) != 1)
18383 +                       BUG();
18384 +               if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18385 +                                        &exchange.out) != (1UL << order))
18386 +                       BUG();
18387 +               success = 1;
18388 +       }
18389 +#endif
18390 +
18391 +       /* 4. Map new pages in place of old pages. */
18392 +       for (i = 0; i < (1UL<<order); i++) {
18393 +               frame = success ? out_frames[i] : (in_frame + i);
18394 +               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
18395 +                                       pfn_pte_ma(frame, PAGE_KERNEL), 0);
18396 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
18397 +       }
18398 +
18399 +       cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
18400 +                                                  ? UVMF_TLB_FLUSH|UVMF_ALL
18401 +                                                  : UVMF_INVLPG|UVMF_ALL;
18402 +       if (HYPERVISOR_multicall(cr_mcl, i))
18403 +               BUG();
18404 +
18405 +       balloon_unlock(flags);
18406 +}
18407 +
18408 +#ifdef __i386__
18409 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
18410 +{
18411 +       __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
18412 +       maddr_t mach_lp = arbitrary_virt_to_machine(lp);
18413 +       return HYPERVISOR_update_descriptor(
18414 +               mach_lp, (u64)entry_a | ((u64)entry_b<<32));
18415 +}
18416 +#endif
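
For reference, a sketch (not part of the patch) of how a caller such as a DMA allocator might use the two interfaces above to obtain a buffer of 2^order pages that is machine-contiguous and addressable with 32 bits; the error handling is minimal and the variable names are illustrative:

        unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

        if (vstart && xen_create_contiguous_region(vstart, order, 32) != 0) {
                /* No machine-contiguous extent below 4GB was available. */
                free_pages(vstart, order);
                vstart = 0;
        }
        /* ... use the buffer ... */
        if (vstart) {
                xen_destroy_contiguous_region(vstart, order);
                free_pages(vstart, order);
        }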
18417 diff -ruNp linux-2.6.19/arch/i386/mm/init-xen.c linux-2.6.19-xen-3.0.4/arch/i386/mm/init-xen.c
18418 --- linux-2.6.19/arch/i386/mm/init-xen.c        1970-01-01 00:00:00.000000000 +0000
18419 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/init-xen.c      2007-02-02 19:10:21.000000000 +0000
18420 @@ -0,0 +1,879 @@
18421 +/*
18422 + *  linux/arch/i386/mm/init.c
18423 + *
18424 + *  Copyright (C) 1995  Linus Torvalds
18425 + *
18426 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
18427 + */
18428 +
18429 +#include <linux/module.h>
18430 +#include <linux/signal.h>
18431 +#include <linux/sched.h>
18432 +#include <linux/kernel.h>
18433 +#include <linux/errno.h>
18434 +#include <linux/string.h>
18435 +#include <linux/types.h>
18436 +#include <linux/ptrace.h>
18437 +#include <linux/mman.h>
18438 +#include <linux/mm.h>
18439 +#include <linux/hugetlb.h>
18440 +#include <linux/swap.h>
18441 +#include <linux/smp.h>
18442 +#include <linux/init.h>
18443 +#include <linux/highmem.h>
18444 +#include <linux/pagemap.h>
18445 +#include <linux/poison.h>
18446 +#include <linux/bootmem.h>
18447 +#include <linux/slab.h>
18448 +#include <linux/proc_fs.h>
18449 +#include <linux/efi.h>
18450 +#include <linux/memory_hotplug.h>
18451 +#include <linux/initrd.h>
18452 +#include <linux/cpumask.h>
18453 +#include <linux/dma-mapping.h>
18454 +#include <linux/scatterlist.h>
18455 +
18456 +#include <asm/processor.h>
18457 +#include <asm/system.h>
18458 +#include <asm/uaccess.h>
18459 +#include <asm/pgtable.h>
18460 +#include <asm/dma.h>
18461 +#include <asm/fixmap.h>
18462 +#include <asm/e820.h>
18463 +#include <asm/apic.h>
18464 +#include <asm/tlb.h>
18465 +#include <asm/tlbflush.h>
18466 +#include <asm/sections.h>
18467 +#include <asm/hypervisor.h>
18468 +#include <asm/swiotlb.h>
18469 +
18470 +extern unsigned long *contiguous_bitmap;
18471 +
18472 +unsigned int __VMALLOC_RESERVE = 128 << 20;
18473 +
18474 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18475 +unsigned long highstart_pfn, highend_pfn;
18476 +
18477 +static int noinline do_test_wp_bit(void);
18478 +
18479 +/*
18480 + * Creates a middle page table and puts a pointer to it in the
18481 + * given global directory entry. This only returns the gd entry
18482 + * in non-PAE compilation mode, since the middle layer is folded.
18483 + */
18484 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
18485 +{
18486 +       pud_t *pud;
18487 +       pmd_t *pmd_table;
18488 +               
18489 +#ifdef CONFIG_X86_PAE
18490 +       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18491 +       make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18492 +       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18493 +       pud = pud_offset(pgd, 0);
18494 +       if (pmd_table != pmd_offset(pud, 0)) 
18495 +               BUG();
18496 +#else
18497 +       pud = pud_offset(pgd, 0);
18498 +       pmd_table = pmd_offset(pud, 0);
18499 +#endif
18500 +
18501 +       return pmd_table;
18502 +}
18503 +
18504 +/*
18505 + * Create a page table and place a pointer to it in a middle page
18506 + * directory entry.
18507 + */
18508 +static pte_t * __init one_page_table_init(pmd_t *pmd)
18509 +{
18510 +       if (pmd_none(*pmd)) {
18511 +               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18512 +               make_lowmem_page_readonly(page_table,
18513 +                                         XENFEAT_writable_page_tables);
18514 +               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
18515 +               if (page_table != pte_offset_kernel(pmd, 0))
18516 +                       BUG();  
18517 +
18518 +               return page_table;
18519 +       }
18520 +       
18521 +       return pte_offset_kernel(pmd, 0);
18522 +}
18523 +
18524 +/*
18525 + * This function initializes a certain range of kernel virtual memory 
18526 + * with new bootmem page tables, wherever page tables are missing in
18527 + * the given range.
18528 + */
18529 +
18530 +/*
18531 + * NOTE: The pagetables are allocated contiguously in physical space
18532 + * so we can cache the place of the first one and move around without 
18533 + * checking the pgd every time.
18534 + */
18535 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
18536 +{
18537 +       pgd_t *pgd;
18538 +       pud_t *pud;
18539 +       pmd_t *pmd;
18540 +       int pgd_idx, pmd_idx;
18541 +       unsigned long vaddr;
18542 +
18543 +       vaddr = start;
18544 +       pgd_idx = pgd_index(vaddr);
18545 +       pmd_idx = pmd_index(vaddr);
18546 +       pgd = pgd_base + pgd_idx;
18547 +
18548 +       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
18549 +               if (pgd_none(*pgd)) 
18550 +                       one_md_table_init(pgd);
18551 +               pud = pud_offset(pgd, vaddr);
18552 +               pmd = pmd_offset(pud, vaddr);
18553 +               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
18554 +                       if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
18555 +                               one_page_table_init(pmd);
18556 +
18557 +                       vaddr += PMD_SIZE;
18558 +               }
18559 +               pmd_idx = 0;
18560 +       }
18561 +}
18562 +
18563 +static inline int is_kernel_text(unsigned long addr)
18564 +{
18565 +       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
18566 +               return 1;
18567 +       return 0;
18568 +}
18569 +
18570 +/*
18571 + * This maps the physical memory to kernel virtual address space, a total 
18572 + * of max_low_pfn pages, by creating page tables starting from address 
18573 + * PAGE_OFFSET.
18574 + */
18575 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18576 +{
18577 +       unsigned long pfn;
18578 +       pgd_t *pgd;
18579 +       pmd_t *pmd;
18580 +       pte_t *pte;
18581 +       int pgd_idx, pmd_idx, pte_ofs;
18582 +
18583 +       unsigned long max_ram_pfn = xen_start_info->nr_pages;
18584 +       if (max_ram_pfn > max_low_pfn)
18585 +               max_ram_pfn = max_low_pfn;
18586 +
18587 +       pgd_idx = pgd_index(PAGE_OFFSET);
18588 +       pgd = pgd_base + pgd_idx;
18589 +       pfn = 0;
18590 +       pmd_idx = pmd_index(PAGE_OFFSET);
18591 +       pte_ofs = pte_index(PAGE_OFFSET);
18592 +
18593 +       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18594 +#ifdef CONFIG_XEN
18595 +               /*
18596 +                * Native linux does not have PAE paging enabled yet at this
18597 +                * point.  When running as a xen domain we are in PAE
18598 +                * mode already, thus we can't simply hook in an empty
18599 +                * pmd.  That would kill the mappings we are currently
18600 +                * using ...
18601 +                */
18602 +               pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
18603 +#else
18604 +               pmd = one_md_table_init(pgd);
18605 +#endif
18606 +               if (pfn >= max_low_pfn)
18607 +                       continue;
18608 +               pmd += pmd_idx;
18609 +               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
18610 +                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
18611 +                       if (address >= hypervisor_virt_start)
18612 +                               continue;
18613 +
18614 +                       /* Map with big pages if possible, otherwise create normal page tables. */
18615 +                       if (cpu_has_pse) {
18616 +                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
18617 +
18618 +                               if (is_kernel_text(address) || is_kernel_text(address2))
18619 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
18620 +                               else
18621 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
18622 +                               pfn += PTRS_PER_PTE;
18623 +                       } else {
18624 +                               pte = one_page_table_init(pmd);
18625 +
18626 +                               pte += pte_ofs;
18627 +                               for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
18628 +                                               /* XEN: Only map initial RAM allocation. */
18629 +                                               if ((pfn >= max_ram_pfn) || pte_present(*pte))
18630 +                                                       continue;
18631 +                                               if (is_kernel_text(address))
18632 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
18633 +                                               else
18634 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
18635 +                               }
18636 +                               pte_ofs = 0;
18637 +                       }
18638 +               }
18639 +               pmd_idx = 0;
18640 +       }
18641 +}
18642 +
18643 +#ifndef CONFIG_XEN
18644 +
18645 +static inline int page_kills_ppro(unsigned long pagenr)
18646 +{
18647 +       if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18648 +               return 1;
18649 +       return 0;
18650 +}
18651 +
18652 +#else
18653 +
18654 +#define page_kills_ppro(p)     0
18655 +
18656 +#endif
18657 +
18658 +extern int is_available_memory(efi_memory_desc_t *);
18659 +
18660 +int page_is_ram(unsigned long pagenr)
18661 +{
18662 +       int i;
18663 +       unsigned long addr, end;
18664 +
18665 +       if (efi_enabled) {
18666 +               efi_memory_desc_t *md;
18667 +               void *p;
18668 +
18669 +               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
18670 +                       md = p;
18671 +                       if (!is_available_memory(md))
18672 +                               continue;
18673 +                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
18674 +                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
18675 +
18676 +                       if ((pagenr >= addr) && (pagenr < end))
18677 +                               return 1;
18678 +               }
18679 +               return 0;
18680 +       }
18681 +
18682 +       for (i = 0; i < e820.nr_map; i++) {
18683 +
18684 +               if (e820.map[i].type != E820_RAM)       /* not usable memory */
18685 +                       continue;
18686 +               /*
18687 +                *      !!!FIXME!!! Some BIOSen report areas as RAM that
18688 +                *      are not. Notably the 640->1Mb area. We need a sanity
18689 +                *      check here.
18690 +                */
18691 +               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
18692 +               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
18693 +               if  ((pagenr >= addr) && (pagenr < end))
18694 +                       return 1;
18695 +       }
18696 +       return 0;
18697 +}
18698 +
18699 +#ifdef CONFIG_HIGHMEM
18700 +pte_t *kmap_pte;
18701 +pgprot_t kmap_prot;
18702 +
18703 +#define kmap_get_fixmap_pte(vaddr)                                     \
18704 +       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
18705 +
18706 +static void __init kmap_init(void)
18707 +{
18708 +       unsigned long kmap_vstart;
18709 +
18710 +       /* cache the first kmap pte */
18711 +       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
18712 +       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
18713 +
18714 +       kmap_prot = PAGE_KERNEL;
18715 +}
18716 +
18717 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
18718 +{
18719 +       pgd_t *pgd;
18720 +       pud_t *pud;
18721 +       pmd_t *pmd;
18722 +       pte_t *pte;
18723 +       unsigned long vaddr;
18724 +
18725 +       vaddr = PKMAP_BASE;
18726 +       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
18727 +
18728 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18729 +       pud = pud_offset(pgd, vaddr);
18730 +       pmd = pmd_offset(pud, vaddr);
18731 +       pte = pte_offset_kernel(pmd, vaddr);
18732 +       pkmap_page_table = pte; 
18733 +}
18734 +
18735 +static void __meminit free_new_highpage(struct page *page, int pfn)
18736 +{
18737 +       init_page_count(page);
18738 +       if (pfn < xen_start_info->nr_pages)
18739 +               __free_page(page);
18740 +       totalhigh_pages++;
18741 +}
18742 +
18743 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18744 +{
18745 +       if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18746 +               ClearPageReserved(page);
18747 +               free_new_highpage(page, pfn);
18748 +       } else
18749 +               SetPageReserved(page);
18750 +}
18751 +
18752 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
18753 +{
18754 +       free_new_highpage(page, pfn);
18755 +       totalram_pages++;
18756 +#ifdef CONFIG_FLATMEM
18757 +       max_mapnr = max(pfn, max_mapnr);
18758 +#endif
18759 +       num_physpages++;
18760 +       return 0;
18761 +}
18762 +
18763 +/*
18764 + * Not currently handling the NUMA case.
18765 + * Assuming a single node, and that all memory
18766 + * added dynamically and onlined here
18767 + * is in HIGHMEM.
18768 + */
18769 +void online_page(struct page *page)
18770 +{
18771 +       ClearPageReserved(page);
18772 +       add_one_highpage_hotplug(page, page_to_pfn(page));
18773 +}
18774 +
18775 +
18776 +#ifdef CONFIG_NUMA
18777 +extern void set_highmem_pages_init(int);
18778 +#else
18779 +static void __init set_highmem_pages_init(int bad_ppro)
18780 +{
18781 +       int pfn;
18782 +       for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
18783 +               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18784 +       totalram_pages += totalhigh_pages;
18785 +}
18786 +#endif /* CONFIG_NUMA */
18787 +
18788 +#else
18789 +#define kmap_init() do { } while (0)
18790 +#define permanent_kmaps_init(pgd_base) do { } while (0)
18791 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
18792 +#endif /* CONFIG_HIGHMEM */
18793 +
18794 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
18795 +EXPORT_SYMBOL(__PAGE_KERNEL);
18796 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18797 +
18798 +#ifdef CONFIG_NUMA
18799 +extern void __init remap_numa_kva(void);
18800 +#else
18801 +#define remap_numa_kva() do {} while (0)
18802 +#endif
18803 +
18804 +pgd_t *swapper_pg_dir;
18805 +
18806 +static void __init pagetable_init (void)
18807 +{
18808 +       unsigned long vaddr;
18809 +       pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18810 +
18811 +       swapper_pg_dir = pgd_base;
18812 +       init_mm.pgd    = pgd_base;
18813 +
18814 +       /* Enable PSE if available */
18815 +       if (cpu_has_pse) {
18816 +               set_in_cr4(X86_CR4_PSE);
18817 +       }
18818 +
18819 +       /* Enable PGE if available */
18820 +       if (cpu_has_pge) {
18821 +               set_in_cr4(X86_CR4_PGE);
18822 +               __PAGE_KERNEL |= _PAGE_GLOBAL;
18823 +               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18824 +       }
18825 +
18826 +       kernel_physical_mapping_init(pgd_base);
18827 +       remap_numa_kva();
18828 +
18829 +       /*
18830 +        * Fixed mappings, only the page table structure has to be
18831 +        * created - mappings will be set by set_fixmap():
18832 +        */
18833 +       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
18834 +       page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
18835 +
18836 +       permanent_kmaps_init(pgd_base);
18837 +}
18838 +
18839 +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
18840 +/*
18841 + * Swap suspend & friends need this for resume because things like the intel-agp
18842 + * driver might have split up a kernel 4MB mapping.
18843 + */
18844 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
18845 +       __attribute__ ((aligned (PAGE_SIZE)));
18846 +
18847 +static inline void save_pg_dir(void)
18848 +{
18849 +       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
18850 +}
18851 +#else
18852 +static inline void save_pg_dir(void)
18853 +{
18854 +}
18855 +#endif
18856 +
18857 +void zap_low_mappings (void)
18858 +{
18859 +       int i;
18860 +
18861 +       save_pg_dir();
18862 +
18863 +       /*
18864 +        * Zap initial low-memory mappings.
18865 +        *
18866 +        * Note that "pgd_clear()" doesn't do it for
18867 +        * us, because pgd_clear() is a no-op on i386.
18868 +        */
18869 +       for (i = 0; i < USER_PTRS_PER_PGD; i++)
18870 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
18871 +               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
18872 +#else
18873 +               set_pgd(swapper_pg_dir+i, __pgd(0));
18874 +#endif
18875 +       flush_tlb_all();
18876 +}
18877 +
18878 +static int disable_nx __initdata = 0;
18879 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
18880 +EXPORT_SYMBOL(__supported_pte_mask);
18881 +
18882 +/*
18883 + * noexec = on|off
18884 + *
18885 + * Control non executable mappings.
18886 + *
18887 + * on      Enable
18888 + * off     Disable
18889 + */
18890 +static int __init noexec_setup(char *str)
18891 +{
18892 +       if (!str || !strcmp(str, "on")) {
18893 +               if (cpu_has_nx) {
18894 +                       __supported_pte_mask |= _PAGE_NX;
18895 +                       disable_nx = 0;
18896 +               }
18897 +       } else if (!strcmp(str,"off")) {
18898 +               disable_nx = 1;
18899 +               __supported_pte_mask &= ~_PAGE_NX;
18900 +       } else
18901 +               return -EINVAL;
18902 +
18903 +       return 0;
18904 +}
18905 +early_param("noexec", noexec_setup);
18906 +
18907 +int nx_enabled = 0;
18908 +#ifdef CONFIG_X86_PAE
18909 +
18910 +static void __init set_nx(void)
18911 +{
18912 +       unsigned int v[4], l, h;
18913 +
18914 +       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
18915 +               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
18916 +               if ((v[3] & (1 << 20)) && !disable_nx) {
18917 +                       rdmsr(MSR_EFER, l, h);
18918 +                       l |= EFER_NX;
18919 +                       wrmsr(MSR_EFER, l, h);
18920 +                       nx_enabled = 1;
18921 +                       __supported_pte_mask |= _PAGE_NX;
18922 +               }
18923 +       }
18924 +}
18925 +
18926 +/*
18927 + * Enables/disables executability of a given kernel page and
18928 + * returns the previous setting.
18929 + */
18930 +int __init set_kernel_exec(unsigned long vaddr, int enable)
18931 +{
18932 +       pte_t *pte;
18933 +       int ret = 1;
18934 +
18935 +       if (!nx_enabled)
18936 +               goto out;
18937 +
18938 +       pte = lookup_address(vaddr);
18939 +       BUG_ON(!pte);
18940 +
18941 +       if (!pte_exec_kernel(*pte))
18942 +               ret = 0;
18943 +
18944 +       if (enable)
18945 +               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
18946 +       else
18947 +               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
18948 +       pte_update_defer(&init_mm, vaddr, pte);
18949 +       __flush_tlb_all();
18950 +out:
18951 +       return ret;
18952 +}
18953 +
18954 +#endif
18955 +
18956 +/*
18957 + * paging_init() sets up the page tables - note that the first 8MB are
18958 + * already mapped by head.S.
18959 + *
18960 + * This routine also unmaps the page at virtual kernel address 0, so
18961 + * that we can trap those pesky NULL-reference errors in the kernel.
18962 + */
18963 +void __init paging_init(void)
18964 +{
18965 +       int i;
18966 +
18967 +#ifdef CONFIG_X86_PAE
18968 +       set_nx();
18969 +       if (nx_enabled)
18970 +               printk("NX (Execute Disable) protection: active\n");
18971 +#endif
18972 +
18973 +       pagetable_init();
18974 +
18975 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
18976 +       /*
18977 +        * We will bail out later - printk doesn't work right now so
18978 +        * the user would just see a hanging kernel.
18979 +        * When running as a xen domain we are already in PAE mode at
18980 +        * this point.
18981 +        */
18982 +       if (cpu_has_pae)
18983 +               set_in_cr4(X86_CR4_PAE);
18984 +#endif
18985 +       __flush_tlb_all();
18986 +
18987 +       kmap_init();
18988 +
18989 +       /* Switch to the real shared_info page, and clear the
18990 +        * dummy page. */
18991 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18992 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18993 +       memset(empty_zero_page, 0, sizeof(empty_zero_page));
18994 +
18995 +       /* Setup mapping of lower 1st MB */
18996 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
18997 +               if (is_initial_xendomain())
18998 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18999 +               else
19000 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
19001 +                                    virt_to_machine(empty_zero_page),
19002 +                                    PAGE_KERNEL_RO);
19003 +}
19004 +
19005 +/*
19006 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
19007 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
19008 + * used to involve black magic jumps to work around some nasty CPU bugs,
19009 + * but fortunately the switch to using exceptions got rid of all that.
19010 + */
19011 +
19012 +static void __init test_wp_bit(void)
19013 +{
19014 +       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
19015 +
19016 +       /* Any page-aligned address will do, the test is non-destructive */
19017 +       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
19018 +       boot_cpu_data.wp_works_ok = do_test_wp_bit();
19019 +       clear_fixmap(FIX_WP_TEST);
19020 +
19021 +       if (!boot_cpu_data.wp_works_ok) {
19022 +               printk("No.\n");
19023 +#ifdef CONFIG_X86_WP_WORKS_OK
19024 +               panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
19025 +#endif
19026 +       } else {
19027 +               printk("Ok.\n");
19028 +       }
19029 +}
19030 +
19031 +static struct kcore_list kcore_mem, kcore_vmalloc; 
19032 +
19033 +void __init mem_init(void)
19034 +{
19035 +       extern int ppro_with_ram_bug(void);
19036 +       int codesize, reservedpages, datasize, initsize;
19037 +       int tmp;
19038 +       int bad_ppro;
19039 +       unsigned long pfn;
19040 +
19041 +       contiguous_bitmap = alloc_bootmem_low_pages(
19042 +               (max_low_pfn + 2*BITS_PER_LONG) >> 3);
19043 +       BUG_ON(!contiguous_bitmap);
19044 +       memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
19045 +
19046 +#if defined(CONFIG_SWIOTLB)
19047 +       swiotlb_init(); 
19048 +#endif
19049 +
19050 +#ifdef CONFIG_FLATMEM
19051 +       BUG_ON(!mem_map);
19052 +#endif
19053 +       
19054 +       bad_ppro = ppro_with_ram_bug();
19055 +
19056 +#ifdef CONFIG_HIGHMEM
19057 +       /* check that fixmap and pkmap do not overlap */
19058 +       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
19059 +               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
19060 +               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
19061 +                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
19062 +               BUG();
19063 +       }
19064 +#endif
19065
19066 +       printk("vmalloc area: %lx-%lx, maxmem %lx\n",
19067 +              VMALLOC_START,VMALLOC_END,MAXMEM);
19068 +       BUG_ON(VMALLOC_START > VMALLOC_END);
19069 +       
19070 +       /* this will put all low memory onto the freelists */
19071 +       totalram_pages += free_all_bootmem();
19072 +       /* XEN: init and count low-mem pages outside initial allocation. */
19073 +       for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
19074 +               ClearPageReserved(pfn_to_page(pfn));
19075 +               init_page_count(pfn_to_page(pfn));
19076 +               totalram_pages++;
19077 +       }
19078 +
19079 +       reservedpages = 0;
19080 +       for (tmp = 0; tmp < max_low_pfn; tmp++)
19081 +               /*
19082 +                * Only count reserved RAM pages
19083 +                */
19084 +               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
19085 +                       reservedpages++;
19086 +
19087 +       set_highmem_pages_init(bad_ppro);
19088 +
19089 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
19090 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
19091 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
19092 +
19093 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
19094 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
19095 +                  VMALLOC_END-VMALLOC_START);
19096 +
19097 +       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
19098 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
19099 +               num_physpages << (PAGE_SHIFT-10),
19100 +               codesize >> 10,
19101 +               reservedpages << (PAGE_SHIFT-10),
19102 +               datasize >> 10,
19103 +               initsize >> 10,
19104 +               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
19105 +              );
19106 +
19107 +#if 1 /* double-sanity-check paranoia */
19108 +       printk("virtual kernel memory layout:\n"
19109 +              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
19110 +#ifdef CONFIG_HIGHMEM
19111 +              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
19112 +#endif
19113 +              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
19114 +              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
19115 +              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
19116 +              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
19117 +              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
19118 +              FIXADDR_START, FIXADDR_TOP,
19119 +              (FIXADDR_TOP - FIXADDR_START) >> 10,
19120 +
19121 +#ifdef CONFIG_HIGHMEM
19122 +              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
19123 +              (LAST_PKMAP*PAGE_SIZE) >> 10,
19124 +#endif
19125 +
19126 +              VMALLOC_START, VMALLOC_END,
19127 +              (VMALLOC_END - VMALLOC_START) >> 20,
19128 +
19129 +              (unsigned long)__va(0), (unsigned long)high_memory,
19130 +              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
19131 +
19132 +              (unsigned long)&__init_begin, (unsigned long)&__init_end,
19133 +              ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
19134 +
19135 +              (unsigned long)&_etext, (unsigned long)&_edata,
19136 +              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
19137 +
19138 +              (unsigned long)&_text, (unsigned long)&_etext,
19139 +              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
19140 +
19141 +#ifdef CONFIG_HIGHMEM
19142 +       BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
19143 +       BUG_ON(VMALLOC_END                     > PKMAP_BASE);
19144 +#endif
19145 +       BUG_ON(VMALLOC_START                   > VMALLOC_END);
19146 +       BUG_ON((unsigned long)high_memory      > VMALLOC_START);
19147 +#endif /* double-sanity-check paranoia */
19148 +
19149 +#ifdef CONFIG_X86_PAE
19150 +       if (!cpu_has_pae)
19151 +               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
19152 +#endif
19153 +       if (boot_cpu_data.wp_works_ok < 0)
19154 +               test_wp_bit();
19155 +
19156 +       /*
19157 +        * Subtle. SMP is doing its boot stuff late (because it has to
19158 +        * fork idle threads) - but it also needs low mappings for the
19159 +        * protected-mode entry to work. We zap these entries only after
19160 +        * the WP-bit has been tested.
19161 +        */
19162 +#ifndef CONFIG_SMP
19163 +       zap_low_mappings();
19164 +#endif
19165 +
19166 +       set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
19167 +}
19168 +
19169 +/*
19170 + * this is for the non-NUMA, single node SMP system case.
19171 + * Specifically, in the case of x86, we will always add
19172 + * memory to the highmem for now.
19173 + */
19174 +#ifdef CONFIG_MEMORY_HOTPLUG
19175 +#ifndef CONFIG_NEED_MULTIPLE_NODES
19176 +int arch_add_memory(int nid, u64 start, u64 size)
19177 +{
19178 +       struct pglist_data *pgdata = &contig_page_data;
19179 +       struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
19180 +       unsigned long start_pfn = start >> PAGE_SHIFT;
19181 +       unsigned long nr_pages = size >> PAGE_SHIFT;
19182 +
19183 +       return __add_pages(zone, start_pfn, nr_pages);
19184 +}
19185 +
19186 +int remove_memory(u64 start, u64 size)
19187 +{
19188 +       return -EINVAL;
19189 +}
19190 +#endif
19191 +#endif
19192 +
19193 +kmem_cache_t *pgd_cache;
19194 +kmem_cache_t *pmd_cache;
19195 +
19196 +void __init pgtable_cache_init(void)
19197 +{
19198 +       if (PTRS_PER_PMD > 1) {
19199 +               pmd_cache = kmem_cache_create("pmd",
19200 +                                       PTRS_PER_PMD*sizeof(pmd_t),
19201 +                                       PTRS_PER_PMD*sizeof(pmd_t),
19202 +                                       0,
19203 +                                       pmd_ctor,
19204 +                                       NULL);
19205 +               if (!pmd_cache)
19206 +                       panic("pgtable_cache_init(): cannot create pmd cache");
19207 +       }
19208 +       pgd_cache = kmem_cache_create("pgd",
19209 +#ifndef CONFIG_XEN
19210 +                               PTRS_PER_PGD*sizeof(pgd_t),
19211 +                               PTRS_PER_PGD*sizeof(pgd_t),
19212 +#else
19213 +                               PAGE_SIZE,
19214 +                               PAGE_SIZE,
19215 +#endif
19216 +                               0,
19217 +                               pgd_ctor,
19218 +                               PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
19219 +       if (!pgd_cache)
19220 +               panic("pgtable_cache_init(): Cannot create pgd cache");
19221 +}
19222 +
19223 +/*
19224 + * This function cannot be __init, since exceptions don't work in that
19225 + * section.  Put this after the callers, so that it cannot be inlined.
19226 + */
19227 +static int noinline do_test_wp_bit(void)
19228 +{
19229 +       char tmp_reg;
19230 +       int flag;
19231 +
19232 +       __asm__ __volatile__(
19233 +               "       movb %0,%1      \n"
19234 +               "1:     movb %1,%0      \n"
19235 +               "       xorl %2,%2      \n"
19236 +               "2:                     \n"
19237 +               ".section __ex_table,\"a\"\n"
19238 +               "       .align 4        \n"
19239 +               "       .long 1b,2b     \n"
19240 +               ".previous              \n"
19241 +               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
19242 +                "=q" (tmp_reg),
19243 +                "=r" (flag)
19244 +               :"2" (1)
19245 +               :"memory");
19246 +       
19247 +       return flag;
19248 +}
19249 +
19250 +#ifdef CONFIG_DEBUG_RODATA
19251 +
19252 +void mark_rodata_ro(void)
19253 +{
19254 +       unsigned long addr = (unsigned long)__start_rodata;
19255 +
19256 +       for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
19257 +               change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
19258 +
19259 +       printk("Write protecting the kernel read-only data: %uk\n",
19260 +                       (__end_rodata - __start_rodata) >> 10);
19261 +
19262 +       /*
19263 +        * change_page_attr() requires a global_flush_tlb() call after it.
19264 +        * We do this after the printk so that if something went wrong in the
19265 +        * change, the printk gets out at least to give a better debug hint
19266 +        * of who is the culprit.
19267 +        */
19268 +       global_flush_tlb();
19269 +}
19270 +#endif
19271 +
19272 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
19273 +{
19274 +       unsigned long addr;
19275 +
19276 +       for (addr = begin; addr < end; addr += PAGE_SIZE) {
19277 +               ClearPageReserved(virt_to_page(addr));
19278 +               init_page_count(virt_to_page(addr));
19279 +               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
19280 +               free_page(addr);
19281 +               totalram_pages++;
19282 +       }
19283 +       printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
19284 +}
19285 +
19286 +void free_initmem(void)
19287 +{
19288 +       free_init_pages("unused kernel memory",
19289 +                       (unsigned long)(&__init_begin),
19290 +                       (unsigned long)(&__init_end));
19291 +}
19292 +
19293 +#ifdef CONFIG_BLK_DEV_INITRD
19294 +void free_initrd_mem(unsigned long start, unsigned long end)
19295 +{
19296 +       free_init_pages("initrd memory", start, end);
19297 +}
19298 +#endif
19299 +
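
The mark_rodata_ro() hunk above leans on the contract spelled out in its comment: change_page_attr() only takes full effect once global_flush_tlb() has been called. A minimal sketch of that pairing, assuming a hypothetical helper name (demo_wp_kernel_page) and an approximate include list for a 2.6.19-era i386 tree; it is illustrative only and not part of this patch.

#include <linux/mm.h>
#include <asm/pgtable.h>      /* PAGE_KERNEL, PAGE_KERNEL_RO */
#include <asm/cacheflush.h>   /* change_page_attr(), global_flush_tlb() */

/* Illustrative sketch only; not part of the patch. */
static void demo_wp_kernel_page(void *va)
{
	struct page *page = virt_to_page(va);

	/* Request a read-only kernel mapping for this page. */
	change_page_attr(page, 1, PAGE_KERNEL_RO);
	global_flush_tlb();        /* required after change_page_attr() */

	/* ... the page is now write-protected in the kernel mapping ... */

	/* Restore the normal mapping and flush again. */
	change_page_attr(page, 1, PAGE_KERNEL);
	global_flush_tlb();
}
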
19300 diff -ruNp linux-2.6.19/arch/i386/mm/ioremap-xen.c linux-2.6.19-xen-3.0.4/arch/i386/mm/ioremap-xen.c
19301 --- linux-2.6.19/arch/i386/mm/ioremap-xen.c     1970-01-01 00:00:00.000000000 +0000
19302 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/ioremap-xen.c   2007-02-02 19:10:21.000000000 +0000
19303 @@ -0,0 +1,443 @@
19304 +/*
19305 + * arch/i386/mm/ioremap.c
19306 + *
19307 + * Re-map IO memory to kernel address space so that we can access it.
19308 + * This is needed for high PCI addresses that aren't mapped in the
19309 + * 640k-1MB IO memory area on PC's
19310 + *
19311 + * (C) Copyright 1995 1996 Linus Torvalds
19312 + */
19313 +
19314 +#include <linux/vmalloc.h>
19315 +#include <linux/init.h>
19316 +#include <linux/slab.h>
19317 +#include <linux/module.h>
19318 +#include <asm/io.h>
19319 +#include <asm/fixmap.h>
19320 +#include <asm/cacheflush.h>
19321 +#include <asm/tlbflush.h>
19322 +#include <asm/pgtable.h>
19323 +#include <asm/pgalloc.h>
19324 +
19325 +#define ISA_START_ADDRESS      0x0
19326 +#define ISA_END_ADDRESS                0x100000
19327 +
19328 +static int direct_remap_area_pte_fn(pte_t *pte, 
19329 +                                   struct page *pmd_page,
19330 +                                   unsigned long address, 
19331 +                                   void *data)
19332 +{
19333 +       mmu_update_t **v = (mmu_update_t **)data;
19334 +
19335 +       BUG_ON(!pte_none(*pte));
19336 +
19337 +       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
19338 +                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
19339 +       (*v)++;
19340 +
19341 +       return 0;
19342 +}
19343 +
19344 +static int __direct_remap_pfn_range(struct mm_struct *mm,
19345 +                                   unsigned long address, 
19346 +                                   unsigned long mfn,
19347 +                                   unsigned long size, 
19348 +                                   pgprot_t prot,
19349 +                                   domid_t  domid)
19350 +{
19351 +       int rc;
19352 +       unsigned long i, start_address;
19353 +       mmu_update_t *u, *v, *w;
19354 +
19355 +       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
19356 +       if (u == NULL)
19357 +               return -ENOMEM;
19358 +
19359 +       start_address = address;
19360 +
19361 +       flush_cache_all();
19362 +
19363 +       for (i = 0; i < size; i += PAGE_SIZE) {
19364 +               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
19365 +                       /* Flush a full batch after filling in the PTE ptrs. */
19366 +                       rc = apply_to_page_range(mm, start_address, 
19367 +                                                address - start_address,
19368 +                                                direct_remap_area_pte_fn, &w);
19369 +                       if (rc)
19370 +                               goto out;
19371 +                       rc = -EFAULT;
19372 +                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
19373 +                               goto out;
19374 +                       v = w = u;
19375 +                       start_address = address;
19376 +               }
19377 +
19378 +               /*
19379 +                * Fill in the machine address: PTE ptr is done later by
19380 +                * __direct_remap_area_pages(). 
19381 +                */
19382 +               v->val = pte_val_ma(pfn_pte_ma(mfn, prot));
19383 +
19384 +               mfn++;
19385 +               address += PAGE_SIZE; 
19386 +               v++;
19387 +       }
19388 +
19389 +       if (v != u) {
19390 +               /* Final batch. */
19391 +               rc = apply_to_page_range(mm, start_address,
19392 +                                        address - start_address,
19393 +                                        direct_remap_area_pte_fn, &w);
19394 +               if (rc)
19395 +                       goto out;
19396 +               rc = -EFAULT;
19397 +               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
19398 +                       goto out;
19399 +       }
19400 +
19401 +       rc = 0;
19402 +
19403 + out:
19404 +       flush_tlb_all();
19405 +
19406 +       free_page((unsigned long)u);
19407 +
19408 +       return rc;
19409 +}
19410 +
19411 +int direct_remap_pfn_range(struct vm_area_struct *vma,
19412 +                          unsigned long address, 
19413 +                          unsigned long mfn,
19414 +                          unsigned long size, 
19415 +                          pgprot_t prot,
19416 +                          domid_t  domid)
19417 +{
19418 +       if (xen_feature(XENFEAT_auto_translated_physmap))
19419 +               return remap_pfn_range(vma, address, mfn, size, prot);
19420 +
19421 +       if (domid == DOMID_SELF)
19422 +               return -EINVAL;
19423 +
19424 +       vma->vm_flags |= VM_IO | VM_RESERVED;
19425 +
19426 +       vma->vm_mm->context.has_foreign_mappings = 1;
19427 +
19428 +       return __direct_remap_pfn_range(
19429 +               vma->vm_mm, address, mfn, size, prot, domid);
19430 +}
19431 +EXPORT_SYMBOL(direct_remap_pfn_range);
19432 +
19433 +int direct_kernel_remap_pfn_range(unsigned long address, 
19434 +                                 unsigned long mfn,
19435 +                                 unsigned long size, 
19436 +                                 pgprot_t prot,
19437 +                                 domid_t  domid)
19438 +{
19439 +       return __direct_remap_pfn_range(
19440 +               &init_mm, address, mfn, size, prot, domid);
19441 +}
19442 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
19443 +
19444 +static int lookup_pte_fn(
19445 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
19446 +{
19447 +       uint64_t *ptep = (uint64_t *)data;
19448 +       if (ptep)
19449 +               *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
19450 +                        PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
19451 +       return 0;
19452 +}
19453 +
19454 +int create_lookup_pte_addr(struct mm_struct *mm, 
19455 +                          unsigned long address,
19456 +                          uint64_t *ptep)
19457 +{
19458 +       return apply_to_page_range(mm, address, PAGE_SIZE,
19459 +                                  lookup_pte_fn, ptep);
19460 +}
19461 +
19462 +EXPORT_SYMBOL(create_lookup_pte_addr);
19463 +
19464 +static int noop_fn(
19465 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
19466 +{
19467 +       return 0;
19468 +}
19469 +
19470 +int touch_pte_range(struct mm_struct *mm,
19471 +                   unsigned long address,
19472 +                   unsigned long size)
19473 +{
19474 +       return apply_to_page_range(mm, address, size, noop_fn, NULL);
19475 +} 
19476 +
19477 +EXPORT_SYMBOL(touch_pte_range);
19478 +
19479 +/*
19480 + * Does @address reside within a non-highmem page that is local to this virtual
19481 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
19482 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
19483 + * why this works.
19484 + */
19485 +static inline int is_local_lowmem(unsigned long address)
19486 +{
19487 +       extern unsigned long max_low_pfn;
19488 +       return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
19489 +}
19490 +
19491 +/*
19492 + * Generic mapping function (not visible outside):
19493 + */
19494 +
19495 +/*
19496 + * Remap an arbitrary physical address space into the kernel virtual
19497 + * address space. Needed when the kernel wants to access high addresses
19498 + * directly.
19499 + *
19500 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
19501 + * have to convert them into an offset in a page-aligned mapping, but the
19502 + * caller shouldn't need to know that small detail.
19503 + */
19504 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
19505 +{
19506 +       void __iomem * addr;
19507 +       struct vm_struct * area;
19508 +       unsigned long offset, last_addr;
19509 +       domid_t domid = DOMID_IO;
19510 +
19511 +       /* Don't allow wraparound or zero size */
19512 +       last_addr = phys_addr + size - 1;
19513 +       if (!size || last_addr < phys_addr)
19514 +               return NULL;
19515 +
19516 +       /*
19517 +        * Don't remap the low PCI/ISA area, it's always mapped..
19518 +        */
19519 +       if (is_initial_xendomain() &&
19520 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
19521 +               return (void __iomem *) isa_bus_to_virt(phys_addr);
19522 +
19523 +       /*
19524 +        * Don't allow anybody to remap normal RAM that we're using..
19525 +        */
19526 +       if (is_local_lowmem(phys_addr)) {
19527 +               char *t_addr, *t_end;
19528 +               struct page *page;
19529 +
19530 +               t_addr = bus_to_virt(phys_addr);
19531 +               t_end = t_addr + (size - 1);
19532 +          
19533 +               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
19534 +                       if(!PageReserved(page))
19535 +                               return NULL;
19536 +
19537 +               domid = DOMID_SELF;
19538 +       }
19539 +
19540 +       /*
19541 +        * Mappings have to be page-aligned
19542 +        */
19543 +       offset = phys_addr & ~PAGE_MASK;
19544 +       phys_addr &= PAGE_MASK;
19545 +       size = PAGE_ALIGN(last_addr+1) - phys_addr;
19546 +
19547 +       /*
19548 +        * Ok, go for it..
19549 +        */
19550 +       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
19551 +       if (!area)
19552 +               return NULL;
19553 +       area->phys_addr = phys_addr;
19554 +       addr = (void __iomem *) area->addr;
19555 +       flags |= _KERNPG_TABLE;
19556 +       if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
19557 +                                    phys_addr>>PAGE_SHIFT,
19558 +                                    size, __pgprot(flags), domid)) {
19559 +               vunmap((void __force *) addr);
19560 +               return NULL;
19561 +       }
19562 +       return (void __iomem *) (offset + (char __iomem *)addr);
19563 +}
19564 +EXPORT_SYMBOL(__ioremap);
19565 +
19566 +/**
19567 + * ioremap_nocache     -   map bus memory into CPU space
19568 + * @offset:    bus address of the memory
19569 + * @size:      size of the resource to map
19570 + *
19571 + * ioremap_nocache performs a platform specific sequence of operations to
19572 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
19573 + * writew/writel functions and the other mmio helpers. The returned
19574 + * address is not guaranteed to be usable directly as a virtual
19575 + * address. 
19576 + *
19577 + * This version of ioremap ensures that the memory is marked uncachable
19578 + * on the CPU as well as honouring existing caching rules from things like
19579 + * the PCI bus. Note that there are other caches and buffers on many 
19580 + * busses. In particular driver authors should read up on PCI writes
19581 + *
19582 + * It's useful if some control registers are in such an area and
19583 + * write combining or read caching is not desirable:
19584 + * 
19585 + * Must be freed with iounmap.
19586 + */
19587 +
19588 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
19589 +{
19590 +       unsigned long last_addr;
19591 +       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
19592 +       if (!p) 
19593 +               return p; 
19594 +
19595 +       /* Guaranteed to be > phys_addr, as per __ioremap() */
19596 +       last_addr = phys_addr + size - 1;
19597 +
19598 +       if (is_local_lowmem(last_addr)) { 
19599 +               struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
19600 +               unsigned long npages;
19601 +
19602 +               phys_addr &= PAGE_MASK;
19603 +
19604 +               /* This might overflow and become zero.. */
19605 +               last_addr = PAGE_ALIGN(last_addr);
19606 +
19607 +               /* .. but that's ok, because modulo-2**n arithmetic will make
19608 +               * the page-aligned "last - first" come out right.
19609 +               */
19610 +               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
19611 +
19612 +               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
19613 +                       iounmap(p); 
19614 +                       p = NULL;
19615 +               }
19616 +               global_flush_tlb();
19617 +       }
19618 +
19619 +       return p;                                       
19620 +}
19621 +EXPORT_SYMBOL(ioremap_nocache);
19622 +
19623 +/**
19624 + * iounmap - Free an IO remapping
19625 + * @addr: virtual address from ioremap_*
19626 + *
19627 + * Caller must ensure there is only one unmapping for the same pointer.
19628 + */
19629 +void iounmap(volatile void __iomem *addr)
19630 +{
19631 +       struct vm_struct *p, *o;
19632 +
19633 +       if ((void __force *)addr <= high_memory)
19634 +               return;
19635 +
19636 +       /*
19637 +        * __ioremap special-cases the PCI/ISA range by not instantiating a
19638 +        * vm_area and by simply returning an address into the kernel mapping
19639 +        * of ISA space.   So handle that here.
19640 +        */
19641 +       if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
19642 +               return;
19643 +
19644 +       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
19645 +
19646 +       /* Use the vm area unlocked, assuming the caller
19647 +          ensures there isn't another iounmap for the same address
19648 +          in parallel. Reuse of the virtual address is prevented by
19649 +          leaving it in the global lists until we're done with it.
19650 +          cpa takes care of the direct mappings. */
19651 +       read_lock(&vmlist_lock);
19652 +       for (p = vmlist; p; p = p->next) {
19653 +               if (p->addr == addr)
19654 +                       break;
19655 +       }
19656 +       read_unlock(&vmlist_lock);
19657 +
19658 +       if (!p) {
19659 +               printk("iounmap: bad address %p\n", addr);
19660 +               dump_stack();
19661 +               return;
19662 +       }
19663 +
19664 +       /* Reset the direct mapping. Can block */
19665 +       if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
19666 +               /* p->size includes the guard page, but cpa doesn't like that */
19667 +               change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
19668 +                                (p->size - PAGE_SIZE) >> PAGE_SHIFT,
19669 +                                PAGE_KERNEL);
19670 +               global_flush_tlb();
19671 +       } 
19672 +
19673 +       /* Finally remove it */
19674 +       o = remove_vm_area((void *)addr);
19675 +       BUG_ON(p != o || o == NULL);
19676 +       kfree(p); 
19677 +}
19678 +EXPORT_SYMBOL(iounmap);
19679 +
19680 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
19681 +{
19682 +       unsigned long offset, last_addr;
19683 +       unsigned int nrpages;
19684 +       enum fixed_addresses idx;
19685 +
19686 +       /* Don't allow wraparound or zero size */
19687 +       last_addr = phys_addr + size - 1;
19688 +       if (!size || last_addr < phys_addr)
19689 +               return NULL;
19690 +
19691 +       /*
19692 +        * Don't remap the low PCI/ISA area, it's always mapped..
19693 +        */
19694 +       if (is_initial_xendomain() &&
19695 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
19696 +               return isa_bus_to_virt(phys_addr);
19697 +
19698 +       /*
19699 +        * Mappings have to be page-aligned
19700 +        */
19701 +       offset = phys_addr & ~PAGE_MASK;
19702 +       phys_addr &= PAGE_MASK;
19703 +       size = PAGE_ALIGN(last_addr) - phys_addr;
19704 +
19705 +       /*
19706 +        * Mappings have to fit in the FIX_BTMAP area.
19707 +        */
19708 +       nrpages = size >> PAGE_SHIFT;
19709 +       if (nrpages > NR_FIX_BTMAPS)
19710 +               return NULL;
19711 +
19712 +       /*
19713 +        * Ok, go for it..
19714 +        */
19715 +       idx = FIX_BTMAP_BEGIN;
19716 +       while (nrpages > 0) {
19717 +               set_fixmap(idx, phys_addr);
19718 +               phys_addr += PAGE_SIZE;
19719 +               --idx;
19720 +               --nrpages;
19721 +       }
19722 +       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
19723 +}
19724 +
19725 +void __init bt_iounmap(void *addr, unsigned long size)
19726 +{
19727 +       unsigned long virt_addr;
19728 +       unsigned long offset;
19729 +       unsigned int nrpages;
19730 +       enum fixed_addresses idx;
19731 +
19732 +       virt_addr = (unsigned long)addr;
19733 +       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
19734 +               return;
19735 +       if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
19736 +               return;
19737 +       offset = virt_addr & ~PAGE_MASK;
19738 +       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
19739 +
19740 +       idx = FIX_BTMAP_BEGIN;
19741 +       while (nrpages > 0) {
19742 +               clear_fixmap(idx);
19743 +               --idx;
19744 +               --nrpages;
19745 +       }
19746 +}
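
direct_remap_pfn_range(), exported above, is the Xen-aware counterpart of remap_pfn_range(): it batches HYPERVISOR_mmu_update calls to install machine frames into a VMA, and the exported wrapper rejects DOMID_SELF. A hedged sketch of how a character driver's mmap handler might call it; the foreign domain id, the starting mfn and the header list are assumptions for illustration, not something this patch defines.

#include <linux/fs.h>
#include <linux/mm.h>
#include <asm/pgtable.h>
#include <asm/hypervisor.h>    /* domid_t, direct_remap_pfn_range() (declaration location assumed) */

/* Illustrative sketch only; not part of the patch. */
static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	domid_t foreign_domid = 1;          /* hypothetical target domain */
	unsigned long first_mfn = 0x12345;  /* hypothetical machine frame number */
	unsigned long size = vma->vm_end - vma->vm_start;

	/* DOMID_SELF returns -EINVAL here; use remap_pfn_range() for the
	 * domain's own memory instead. */
	return direct_remap_pfn_range(vma, vma->vm_start, first_mfn,
	                              size, vma->vm_page_prot, foreign_domid);
}
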
19747 diff -ruNp linux-2.6.19/arch/i386/mm/pageattr.c linux-2.6.19-xen-3.0.4/arch/i386/mm/pageattr.c
19748 --- linux-2.6.19/arch/i386/mm/pageattr.c        2006-11-29 21:57:37.000000000 +0000
19749 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/pageattr.c      2007-02-02 19:10:21.000000000 +0000
19750 @@ -84,7 +84,7 @@ static void set_pmd_pte(pte_t *kpte, uns
19751         unsigned long flags;
19752  
19753         set_pte_atomic(kpte, pte);      /* change init_mm */
19754 -       if (PTRS_PER_PMD > 1)
19755 +       if (HAVE_SHARED_KERNEL_PMD)
19756                 return;
19757  
19758         spin_lock_irqsave(&pgd_lock, flags);
19759 diff -ruNp linux-2.6.19/arch/i386/mm/pgtable-xen.c linux-2.6.19-xen-3.0.4/arch/i386/mm/pgtable-xen.c
19760 --- linux-2.6.19/arch/i386/mm/pgtable-xen.c     1970-01-01 00:00:00.000000000 +0000
19761 +++ linux-2.6.19-xen-3.0.4/arch/i386/mm/pgtable-xen.c   2007-02-02 19:10:21.000000000 +0000
19762 @@ -0,0 +1,728 @@
19763 +/*
19764 + *  linux/arch/i386/mm/pgtable.c
19765 + */
19766 +
19767 +#include <linux/sched.h>
19768 +#include <linux/kernel.h>
19769 +#include <linux/errno.h>
19770 +#include <linux/mm.h>
19771 +#include <linux/swap.h>
19772 +#include <linux/smp.h>
19773 +#include <linux/highmem.h>
19774 +#include <linux/slab.h>
19775 +#include <linux/pagemap.h>
19776 +#include <linux/spinlock.h>
19777 +#include <linux/module.h>
19778 +
19779 +#include <asm/system.h>
19780 +#include <asm/pgtable.h>
19781 +#include <asm/pgalloc.h>
19782 +#include <asm/fixmap.h>
19783 +#include <asm/e820.h>
19784 +#include <asm/tlb.h>
19785 +#include <asm/tlbflush.h>
19786 +#include <asm/io.h>
19787 +#include <asm/mmu_context.h>
19788 +
19789 +#include <xen/features.h>
19790 +#include <xen/foreign_page.h>
19791 +#include <asm/hypervisor.h>
19792 +
19793 +static void pgd_test_and_unpin(pgd_t *pgd);
19794 +
19795 +void show_mem(void)
19796 +{
19797 +       int total = 0, reserved = 0;
19798 +       int shared = 0, cached = 0;
19799 +       int highmem = 0;
19800 +       struct page *page;
19801 +       pg_data_t *pgdat;
19802 +       unsigned long i;
19803 +       unsigned long flags;
19804 +
19805 +       printk(KERN_INFO "Mem-info:\n");
19806 +       show_free_areas();
19807 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
19808 +       for_each_online_pgdat(pgdat) {
19809 +               pgdat_resize_lock(pgdat, &flags);
19810 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19811 +                       page = pgdat_page_nr(pgdat, i);
19812 +                       total++;
19813 +                       if (PageHighMem(page))
19814 +                               highmem++;
19815 +                       if (PageReserved(page))
19816 +                               reserved++;
19817 +                       else if (PageSwapCache(page))
19818 +                               cached++;
19819 +                       else if (page_count(page))
19820 +                               shared += page_count(page) - 1;
19821 +               }
19822 +               pgdat_resize_unlock(pgdat, &flags);
19823 +       }
19824 +       printk(KERN_INFO "%d pages of RAM\n", total);
19825 +       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
19826 +       printk(KERN_INFO "%d reserved pages\n", reserved);
19827 +       printk(KERN_INFO "%d pages shared\n", shared);
19828 +       printk(KERN_INFO "%d pages swap cached\n", cached);
19829 +
19830 +       printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
19831 +       printk(KERN_INFO "%lu pages writeback\n",
19832 +                                       global_page_state(NR_WRITEBACK));
19833 +       printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
19834 +       printk(KERN_INFO "%lu pages slab\n",
19835 +               global_page_state(NR_SLAB_RECLAIMABLE) +
19836 +               global_page_state(NR_SLAB_UNRECLAIMABLE));
19837 +       printk(KERN_INFO "%lu pages pagetables\n",
19838 +                                       global_page_state(NR_PAGETABLE));
19839 +}
19840 +
19841 +/*
19842 + * Associate a virtual page frame with a given physical page frame 
19843 + * and protection flags for that frame.
19844 + */ 
19845 +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
19846 +{
19847 +       pgd_t *pgd;
19848 +       pud_t *pud;
19849 +       pmd_t *pmd;
19850 +       pte_t *pte;
19851 +
19852 +       pgd = swapper_pg_dir + pgd_index(vaddr);
19853 +       if (pgd_none(*pgd)) {
19854 +               BUG();
19855 +               return;
19856 +       }
19857 +       pud = pud_offset(pgd, vaddr);
19858 +       if (pud_none(*pud)) {
19859 +               BUG();
19860 +               return;
19861 +       }
19862 +       pmd = pmd_offset(pud, vaddr);
19863 +       if (pmd_none(*pmd)) {
19864 +               BUG();
19865 +               return;
19866 +       }
19867 +       pte = pte_offset_kernel(pmd, vaddr);
19868 +       if (pgprot_val(flags))
19869 +               /* <pfn,flags> stored as-is, to permit clearing entries */
19870 +               set_pte(pte, pfn_pte(pfn, flags));
19871 +       else
19872 +               pte_clear(&init_mm, vaddr, pte);
19873 +
19874 +       /*
19875 +        * It's enough to flush this one mapping.
19876 +        * (PGE mappings get flushed as well)
19877 +        */
19878 +       __flush_tlb_one(vaddr);
19879 +}
19880 +
19881 +/*
19882 + * Associate a virtual page frame with a given physical page frame 
19883 + * and protection flags for that frame.
19884 + */ 
19885 +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
19886 +                          pgprot_t flags)
19887 +{
19888 +       pgd_t *pgd;
19889 +       pud_t *pud;
19890 +       pmd_t *pmd;
19891 +       pte_t *pte;
19892 +
19893 +       pgd = swapper_pg_dir + pgd_index(vaddr);
19894 +       if (pgd_none(*pgd)) {
19895 +               BUG();
19896 +               return;
19897 +       }
19898 +       pud = pud_offset(pgd, vaddr);
19899 +       if (pud_none(*pud)) {
19900 +               BUG();
19901 +               return;
19902 +       }
19903 +       pmd = pmd_offset(pud, vaddr);
19904 +       if (pmd_none(*pmd)) {
19905 +               BUG();
19906 +               return;
19907 +       }
19908 +       pte = pte_offset_kernel(pmd, vaddr);
19909 +       if (pgprot_val(flags))
19910 +               /* <pfn,flags> stored as-is, to permit clearing entries */
19911 +               set_pte(pte, pfn_pte_ma(pfn, flags));
19912 +       else
19913 +               pte_clear(&init_mm, vaddr, pte);
19914 +
19915 +       /*
19916 +        * It's enough to flush this one mapping.
19917 +        * (PGE mappings get flushed as well)
19918 +        */
19919 +       __flush_tlb_one(vaddr);
19920 +}
19921 +
19922 +/*
19923 + * Associate a large virtual page frame with a given physical page frame 
19924 + * and protection flags for that frame. pfn is for the base of the page,
19925 + * vaddr is what the page gets mapped to - both must be properly aligned. 
19926 + * The pmd must already be instantiated. Assumes PAE mode.
19927 + */ 
19928 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
19929 +{
19930 +       pgd_t *pgd;
19931 +       pud_t *pud;
19932 +       pmd_t *pmd;
19933 +
19934 +       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
19935 +               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
19936 +               return; /* BUG(); */
19937 +       }
19938 +       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
19939 +               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
19940 +               return; /* BUG(); */
19941 +       }
19942 +       pgd = swapper_pg_dir + pgd_index(vaddr);
19943 +       if (pgd_none(*pgd)) {
19944 +               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
19945 +               return; /* BUG(); */
19946 +       }
19947 +       pud = pud_offset(pgd, vaddr);
19948 +       pmd = pmd_offset(pud, vaddr);
19949 +       set_pmd(pmd, pfn_pmd(pfn, flags));
19950 +       /*
19951 +        * It's enough to flush this one mapping.
19952 +        * (PGE mappings get flushed as well)
19953 +        */
19954 +       __flush_tlb_one(vaddr);
19955 +}
19956 +
19957 +static int fixmaps;
19958 +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
19959 +#ifndef CONFIG_COMPAT_VDSO
19960 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
19961 +EXPORT_SYMBOL(__FIXADDR_TOP);
19962 +#endif
19963 +
19964 +void __init set_fixaddr_top(void)
19965 +{
19966 +       BUG_ON(fixmaps > 0);
19967 +       __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
19968 +}
19969 +
19970 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
19971 +{
19972 +       unsigned long address = __fix_to_virt(idx);
19973 +
19974 +       if (idx >= __end_of_fixed_addresses) {
19975 +               BUG();
19976 +               return;
19977 +       }
19978 +       switch (idx) {
19979 +       case FIX_WP_TEST:
19980 +#ifdef CONFIG_X86_F00F_BUG
19981 +       case FIX_F00F_IDT:
19982 +#endif
19983 +               set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
19984 +               break;
19985 +       default:
19986 +               set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
19987 +               break;
19988 +       }
19989 +       fixmaps++;
19990 +}
19991 +
19992 +/**
19993 + * reserve_top_address - reserves a hole in the top of kernel address space
19994 + * @reserve - size of hole to reserve
19995 + *
19996 + * Can be used to relocate the fixmap area and poke a hole in the top
19997 + * of kernel address space to make room for a hypervisor.
19998 + */
19999 +void reserve_top_address(unsigned long reserve)
20000 +{
20001 +       BUG_ON(fixmaps > 0);
20002 +#ifdef CONFIG_COMPAT_VDSO
20003 +       BUG_ON(reserve != 0);
20004 +#else
20005 +       __FIXADDR_TOP = -reserve - PAGE_SIZE;
20006 +       __VMALLOC_RESERVE += reserve;
20007 +#endif
20008 +}
20009 +
20010 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
20011 +{
20012 +       pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
20013 +       if (pte)
20014 +               make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
20015 +       return pte;
20016 +}
20017 +
20018 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
20019 +{
20020 +       struct page *pte;
20021 +
20022 +#ifdef CONFIG_HIGHPTE
20023 +       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
20024 +#else
20025 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
20026 +       if (pte) {
20027 +               SetPageForeign(pte, pte_free);
20028 +               init_page_count(pte);
20029 +       }
20030 +#endif
20031 +       return pte;
20032 +}
20033 +
20034 +void pte_free(struct page *pte)
20035 +{
20036 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
20037 +
20038 +       if (!pte_write(*virt_to_ptep(va)))
20039 +               BUG_ON(HYPERVISOR_update_va_mapping(
20040 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
20041 +
20042 +       ClearPageForeign(pte);
20043 +       init_page_count(pte);
20044 +
20045 +       __free_page(pte);
20046 +}
20047 +
20048 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
20049 +{
20050 +       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
20051 +}
20052 +
20053 +/*
20054 + * List of all pgd's needed for non-PAE so it can invalidate entries
20055 + * in both cached and uncached pgd's; not needed for PAE since the
20056 + * kernel pmd is shared. If PAE were not to share the pmd a similar
20057 + * tactic would be needed. This is essentially codepath-based locking
20058 + * against pageattr.c; it is the unique case in which a valid change
20059 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
20060 + * vmalloc faults work because attached pagetables are never freed.
20061 + * The locking scheme was chosen on the basis of manfred's
20062 + * recommendations and having no core impact whatsoever.
20063 + * -- wli
20064 + */
20065 +DEFINE_SPINLOCK(pgd_lock);
20066 +struct page *pgd_list;
20067 +
20068 +static inline void pgd_list_add(pgd_t *pgd)
20069 +{
20070 +       struct page *page = virt_to_page(pgd);
20071 +       page->index = (unsigned long)pgd_list;
20072 +       if (pgd_list)
20073 +               set_page_private(pgd_list, (unsigned long)&page->index);
20074 +       pgd_list = page;
20075 +       set_page_private(page, (unsigned long)&pgd_list);
20076 +}
20077 +
20078 +static inline void pgd_list_del(pgd_t *pgd)
20079 +{
20080 +       struct page *next, **pprev, *page = virt_to_page(pgd);
20081 +       next = (struct page *)page->index;
20082 +       pprev = (struct page **)page_private(page);
20083 +       *pprev = next;
20084 +       if (next)
20085 +               set_page_private(next, (unsigned long)pprev);
20086 +}
20087 +
20088 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
20089 +{
20090 +       unsigned long flags;
20091 +
20092 +       if (PTRS_PER_PMD > 1) {
20093 +               if (HAVE_SHARED_KERNEL_PMD)
20094 +                       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
20095 +                                       swapper_pg_dir + USER_PTRS_PER_PGD,
20096 +                                       KERNEL_PGD_PTRS);
20097 +       } else {
20098 +               spin_lock_irqsave(&pgd_lock, flags);
20099 +               clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
20100 +                               swapper_pg_dir + USER_PTRS_PER_PGD,
20101 +                               KERNEL_PGD_PTRS);
20102 +               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
20103 +               pgd_list_add(pgd);
20104 +               spin_unlock_irqrestore(&pgd_lock, flags);
20105 +       }
20106 +}
20107 +
20108 +/* never called when PTRS_PER_PMD > 1 */
20109 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
20110 +{
20111 +       unsigned long flags; /* can be called from interrupt context */
20112 +
20113 +       spin_lock_irqsave(&pgd_lock, flags);
20114 +       pgd_list_del(pgd);
20115 +       spin_unlock_irqrestore(&pgd_lock, flags);
20116 +
20117 +       pgd_test_and_unpin(pgd);
20118 +}
20119 +
20120 +pgd_t *pgd_alloc(struct mm_struct *mm)
20121 +{
20122 +       int i;
20123 +       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
20124 +       pmd_t **pmd;
20125 +       unsigned long flags;
20126 +
20127 +       pgd_test_and_unpin(pgd);
20128 +
20129 +       if (PTRS_PER_PMD == 1 || !pgd)
20130 +               return pgd;
20131 +
20132 +       if (HAVE_SHARED_KERNEL_PMD) {
20133 +               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
20134 +                       pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
20135 +                       if (!pmd)
20136 +                               goto out_oom;
20137 +                       set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
20138 +               }
20139 +               return pgd;
20140 +       }
20141 +
20142 +       /*
20143 +        * We can race save/restore (if we sleep during a GFP_KERNEL memory
20144 +        * allocation). We therefore store virtual addresses of pmds as they
20145 +        * do not change across save/restore, and poke the machine addresses
20146 +        * into the pgdir under the pgd_lock.
20147 +        */
20148 +       pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
20149 +       if (!pmd) {
20150 +               kmem_cache_free(pgd_cache, pgd);
20151 +               return NULL;
20152 +       }
20153 +
20154 +       /* Allocate pmds, remember virtual addresses. */
20155 +       for (i = 0; i < PTRS_PER_PGD; ++i) {
20156 +               pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
20157 +               if (!pmd[i])
20158 +                       goto out_oom;
20159 +       }
20160 +
20161 +       spin_lock_irqsave(&pgd_lock, flags);
20162 +
20163 +       /* Protect against save/restore: move below 4GB under pgd_lock. */
20164 +       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
20165 +               int rc = xen_create_contiguous_region(
20166 +                       (unsigned long)pgd, 0, 32);
20167 +               if (rc) {
20168 +                       spin_unlock_irqrestore(&pgd_lock, flags);
20169 +                       goto out_oom;
20170 +               }
20171 +       }
20172 +
20173 +       /* Copy kernel pmd contents and write-protect the new pmds. */
20174 +       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
20175 +               unsigned long v = (unsigned long)i << PGDIR_SHIFT;
20176 +               pgd_t *kpgd = pgd_offset_k(v);
20177 +               pud_t *kpud = pud_offset(kpgd, v);
20178 +               pmd_t *kpmd = pmd_offset(kpud, v);
20179 +               memcpy(pmd[i], kpmd, PAGE_SIZE);
20180 +               make_lowmem_page_readonly(
20181 +                       pmd[i], XENFEAT_writable_page_tables);
20182 +       }
20183 +
20184 +       /* It is safe to poke machine addresses of pmds under the pmd_lock. */
20185 +       for (i = 0; i < PTRS_PER_PGD; i++)
20186 +               set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
20187 +
20188 +       /* Ensure this pgd gets picked up and pinned on save/restore. */
20189 +       pgd_list_add(pgd);
20190 +
20191 +       spin_unlock_irqrestore(&pgd_lock, flags);
20192 +
20193 +       kfree(pmd);
20194 +
20195 +       return pgd;
20196 +
20197 +out_oom:
20198 +       if (HAVE_SHARED_KERNEL_PMD) {
20199 +               for (i--; i >= 0; i--)
20200 +                       kmem_cache_free(pmd_cache,
20201 +                                       (void *)__va(pgd_val(pgd[i])-1));
20202 +       } else {
20203 +               for (i--; i >= 0; i--)
20204 +                       kmem_cache_free(pmd_cache, pmd[i]);
20205 +               kfree(pmd);
20206 +       }
20207 +       kmem_cache_free(pgd_cache, pgd);
20208 +       return NULL;
20209 +}
20210 +
20211 +void pgd_free(pgd_t *pgd)
20212 +{
20213 +       int i;
20214 +
20215 +       /*
20216 +        * After this the pgd should not be pinned for the duration of this
20217 +        * function's execution. We should never sleep and thus never race:
20218 +        *  1. User pmds will not become write-protected under our feet due
20219 +        *     to a concurrent mm_pin_all().
20220 +        *  2. The machine addresses in PGD entries will not become invalid
20221 +        *     due to a concurrent save/restore.
20222 +        */
20223 +       pgd_test_and_unpin(pgd);
20224 +
20225 +       /* in the PAE case user pgd entries are overwritten before usage */
20226 +       if (PTRS_PER_PMD > 1) {
20227 +               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
20228 +                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
20229 +                       kmem_cache_free(pmd_cache, pmd);
20230 +               }
20231 +
20232 +               if (!HAVE_SHARED_KERNEL_PMD) {
20233 +                       unsigned long flags;
20234 +                       spin_lock_irqsave(&pgd_lock, flags);
20235 +                       pgd_list_del(pgd);
20236 +                       spin_unlock_irqrestore(&pgd_lock, flags);
20237 +
20238 +                       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
20239 +                               pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
20240 +                               make_lowmem_page_writable(
20241 +                                       pmd, XENFEAT_writable_page_tables);
20242 +                               memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
20243 +                               kmem_cache_free(pmd_cache, pmd);
20244 +                       }
20245 +
20246 +                       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
20247 +                               xen_destroy_contiguous_region(
20248 +                                       (unsigned long)pgd, 0);
20249 +               }
20250 +       }
20251 +
20252 +       /* in the non-PAE case, free_pgtables() clears user pgd entries */
20253 +       kmem_cache_free(pgd_cache, pgd);
20254 +}
20255 +
20256 +void make_lowmem_page_readonly(void *va, unsigned int feature)
20257 +{
20258 +       pte_t *pte;
20259 +       int rc;
20260 +
20261 +       if (xen_feature(feature))
20262 +               return;
20263 +
20264 +       pte = virt_to_ptep(va);
20265 +       rc = HYPERVISOR_update_va_mapping(
20266 +               (unsigned long)va, pte_wrprotect(*pte), 0);
20267 +       BUG_ON(rc);
20268 +}
20269 +
20270 +void make_lowmem_page_writable(void *va, unsigned int feature)
20271 +{
20272 +       pte_t *pte;
20273 +       int rc;
20274 +
20275 +       if (xen_feature(feature))
20276 +               return;
20277 +
20278 +       pte = virt_to_ptep(va);
20279 +       rc = HYPERVISOR_update_va_mapping(
20280 +               (unsigned long)va, pte_mkwrite(*pte), 0);
20281 +       BUG_ON(rc);
20282 +}
20283 +
20284 +void make_page_readonly(void *va, unsigned int feature)
20285 +{
20286 +       pte_t *pte;
20287 +       int rc;
20288 +
20289 +       if (xen_feature(feature))
20290 +               return;
20291 +
20292 +       pte = virt_to_ptep(va);
20293 +       rc = HYPERVISOR_update_va_mapping(
20294 +               (unsigned long)va, pte_wrprotect(*pte), 0);
20295 +       if (rc) /* fallback? */
20296 +               xen_l1_entry_update(pte, pte_wrprotect(*pte));
20297 +       if ((unsigned long)va >= (unsigned long)high_memory) {
20298 +               unsigned long pfn = pte_pfn(*pte);
20299 +#ifdef CONFIG_HIGHMEM
20300 +               if (pfn >= highstart_pfn)
20301 +                       kmap_flush_unused(); /* flush stale writable kmaps */
20302 +               else
20303 +#endif
20304 +                       make_lowmem_page_readonly(
20305 +                               phys_to_virt(pfn << PAGE_SHIFT), feature); 
20306 +       }
20307 +}
20308 +
20309 +void make_page_writable(void *va, unsigned int feature)
20310 +{
20311 +       pte_t *pte;
20312 +       int rc;
20313 +
20314 +       if (xen_feature(feature))
20315 +               return;
20316 +
20317 +       pte = virt_to_ptep(va);
20318 +       rc = HYPERVISOR_update_va_mapping(
20319 +               (unsigned long)va, pte_mkwrite(*pte), 0);
20320 +       if (rc) /* fallback? */
20321 +               xen_l1_entry_update(pte, pte_mkwrite(*pte));
20322 +       if ((unsigned long)va >= (unsigned long)high_memory) {
20323 +               unsigned long pfn = pte_pfn(*pte); 
20324 +#ifdef CONFIG_HIGHMEM
20325 +               if (pfn < highstart_pfn)
20326 +#endif
20327 +                       make_lowmem_page_writable(
20328 +                               phys_to_virt(pfn << PAGE_SHIFT), feature);
20329 +       }
20330 +}
20331 +
20332 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
20333 +{
20334 +       if (xen_feature(feature))
20335 +               return;
20336 +
20337 +       while (nr-- != 0) {
20338 +               make_page_readonly(va, feature);
20339 +               va = (void *)((unsigned long)va + PAGE_SIZE);
20340 +       }
20341 +}
20342 +
20343 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
20344 +{
20345 +       if (xen_feature(feature))
20346 +               return;
20347 +
20348 +       while (nr-- != 0) {
20349 +               make_page_writable(va, feature);
20350 +               va = (void *)((unsigned long)va + PAGE_SIZE);
20351 +       }
20352 +}
20353 +
20354 +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
20355 +{
20356 +       struct page *page = virt_to_page(pt);
20357 +       unsigned long pfn = page_to_pfn(page);
20358 +
20359 +       if (PageHighMem(page))
20360 +               return;
20361 +       BUG_ON(HYPERVISOR_update_va_mapping(
20362 +               (unsigned long)__va(pfn << PAGE_SHIFT),
20363 +               pfn_pte(pfn, flags), 0));
20364 +}
20365 +
20366 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
20367 +{
20368 +       pgd_t *pgd = pgd_base;
20369 +       pud_t *pud;
20370 +       pmd_t *pmd;
20371 +       pte_t *pte;
20372 +       int    g, u, m;
20373 +
20374 +       if (xen_feature(XENFEAT_auto_translated_physmap))
20375 +               return;
20376 +
20377 +       for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
20378 +               if (pgd_none(*pgd))
20379 +                       continue;
20380 +               pud = pud_offset(pgd, 0);
20381 +               if (PTRS_PER_PUD > 1) /* not folded */
20382 +                       pgd_walk_set_prot(pud,flags);
20383 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
20384 +                       if (pud_none(*pud))
20385 +                               continue;
20386 +                       pmd = pmd_offset(pud, 0);
20387 +                       if (PTRS_PER_PMD > 1) /* not folded */
20388 +                               pgd_walk_set_prot(pmd,flags);
20389 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
20390 +                               if (pmd_none(*pmd))
20391 +                                       continue;
20392 +                               pte = pte_offset_kernel(pmd,0);
20393 +                               pgd_walk_set_prot(pte,flags);
20394 +                       }
20395 +               }
20396 +       }
20397 +
20398 +       BUG_ON(HYPERVISOR_update_va_mapping(
20399 +               (unsigned long)pgd_base,
20400 +               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
20401 +               UVMF_TLB_FLUSH));
20402 +}
20403 +
20404 +static void __pgd_pin(pgd_t *pgd)
20405 +{
20406 +       pgd_walk(pgd, PAGE_KERNEL_RO);
20407 +       xen_pgd_pin(__pa(pgd));
20408 +       set_bit(PG_pinned, &virt_to_page(pgd)->flags);
20409 +}
20410 +
20411 +static void __pgd_unpin(pgd_t *pgd)
20412 +{
20413 +       xen_pgd_unpin(__pa(pgd));
20414 +       pgd_walk(pgd, PAGE_KERNEL);
20415 +       clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
20416 +}
20417 +
20418 +static void pgd_test_and_unpin(pgd_t *pgd)
20419 +{
20420 +       if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
20421 +               __pgd_unpin(pgd);
20422 +}
20423 +
20424 +void mm_pin(struct mm_struct *mm)
20425 +{
20426 +       if (xen_feature(XENFEAT_writable_page_tables))
20427 +               return;
20428 +       spin_lock(&mm->page_table_lock);
20429 +       __pgd_pin(mm->pgd);
20430 +       spin_unlock(&mm->page_table_lock);
20431 +}
20432 +
20433 +void mm_unpin(struct mm_struct *mm)
20434 +{
20435 +       if (xen_feature(XENFEAT_writable_page_tables))
20436 +               return;
20437 +       spin_lock(&mm->page_table_lock);
20438 +       __pgd_unpin(mm->pgd);
20439 +       spin_unlock(&mm->page_table_lock);
20440 +}
20441 +
20442 +void mm_pin_all(void)
20443 +{
20444 +       struct page *page;
20445 +
20446 +       /* Only pgds on the pgd_list please: none hidden in the slab cache. */
20447 +       kmem_cache_shrink(pgd_cache);
20448 +
20449 +       if (xen_feature(XENFEAT_writable_page_tables))
20450 +               return;
20451 +
20452 +       for (page = pgd_list; page; page = (struct page *)page->index) {
20453 +               if (!test_bit(PG_pinned, &page->flags))
20454 +                       __pgd_pin((pgd_t *)page_address(page));
20455 +       }
20456 +}
20457 +
20458 +void _arch_dup_mmap(struct mm_struct *mm)
20459 +{
20460 +       if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
20461 +               mm_pin(mm);
20462 +}
20463 +
20464 +void _arch_exit_mmap(struct mm_struct *mm)
20465 +{
20466 +       struct task_struct *tsk = current;
20467 +
20468 +       task_lock(tsk);
20469 +
20470 +       /*
20471 +        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
20472 +        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
20473 +        */
20474 +       if (tsk->active_mm == mm) {
20475 +               tsk->active_mm = &init_mm;
20476 +               atomic_inc(&init_mm.mm_count);
20477 +
20478 +               switch_mm(mm, &init_mm, tsk);
20479 +
20480 +               atomic_dec(&mm->mm_count);
20481 +               BUG_ON(atomic_read(&mm->mm_count) == 0);
20482 +       }
20483 +
20484 +       task_unlock(tsk);
20485 +
20486 +       if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
20487 +           (atomic_read(&mm->mm_count) == 1) &&
20488 +           !mm->context.has_foreign_mappings)
20489 +               mm_unpin(mm);
20490 +}
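
pte_alloc_one_kernel() and pte_free() above capture the rule that runs through this file: a lowmem page handed to Xen as a page table must first be mapped read-only in the guest, and must be made writable again before it goes back to the page allocator; both helpers are no-ops when the guest has XENFEAT_writable_page_tables. A minimal sketch of that pairing, with hypothetical helper names and an assumed include list; it is illustrative only and not part of this patch.

#include <linux/gfp.h>
#include <xen/features.h>      /* XENFEAT_writable_page_tables */
#include <asm/hypervisor.h>    /* make_lowmem_page_readonly()/_writable() (declaration location assumed) */

/* Illustrative sketch only; not part of the patch. */
static void *demo_alloc_pt_page(void)
{
	void *va = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);

	if (va)
		/* No-op when the guest has writable page tables. */
		make_lowmem_page_readonly(va, XENFEAT_writable_page_tables);
	return va;
}

static void demo_free_pt_page(void *va)
{
	/* Undo the write-protection before the page is reused. */
	make_lowmem_page_writable(va, XENFEAT_writable_page_tables);
	free_page((unsigned long)va);
}
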
20491 diff -ruNp linux-2.6.19/arch/i386/oprofile/Makefile linux-2.6.19-xen-3.0.4/arch/i386/oprofile/Makefile
20492 --- linux-2.6.19/arch/i386/oprofile/Makefile    2006-11-29 21:57:37.000000000 +0000
20493 +++ linux-2.6.19-xen-3.0.4/arch/i386/oprofile/Makefile  2007-02-02 19:10:21.000000000 +0000
20494 @@ -6,7 +6,14 @@ DRIVER_OBJS = $(addprefix ../../../drive
20495                 oprofilefs.o oprofile_stats.o  \
20496                 timer_int.o )
20497  
20498 +ifdef CONFIG_XEN
20499 +XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
20500 +                        xenoprofile.o)
20501 +oprofile-y                             := $(DRIVER_OBJS) \
20502 +                                          $(XENOPROF_COMMON_OBJS) xenoprof.o
20503 +else 
20504  oprofile-y                             := $(DRIVER_OBJS) init.o backtrace.o
20505  oprofile-$(CONFIG_X86_LOCAL_APIC)      += nmi_int.o op_model_athlon.o \
20506                                            op_model_ppro.o op_model_p4.o
20507  oprofile-$(CONFIG_X86_IO_APIC)         += nmi_timer_int.o
20508 +endif
20509 diff -ruNp linux-2.6.19/arch/i386/oprofile/xenoprof.c linux-2.6.19-xen-3.0.4/arch/i386/oprofile/xenoprof.c
20510 --- linux-2.6.19/arch/i386/oprofile/xenoprof.c  1970-01-01 00:00:00.000000000 +0000
20511 +++ linux-2.6.19-xen-3.0.4/arch/i386/oprofile/xenoprof.c        2007-02-02 19:10:21.000000000 +0000
20512 @@ -0,0 +1,179 @@
20513 +/**
20514 + * @file xenoprof.c
20515 + *
20516 + * @remark Copyright 2002 OProfile authors
20517 + * @remark Read the file COPYING
20518 + *
20519 + * @author John Levon <levon@movementarian.org>
20520 + *
20521 + * Modified by Aravind Menon and Jose Renato Santos for Xen
20522 + * These modifications are:
20523 + * Copyright (C) 2005 Hewlett-Packard Co.
20524 + *
20525 + * x86-specific part
20526 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
20527 + *                    VA Linux Systems Japan K.K.
20528 + */
20529 +
20530 +#include <linux/init.h>
20531 +#include <linux/oprofile.h>
20532 +#include <linux/sched.h>
20533 +#include <asm/pgtable.h>
20534 +
20535 +#include <xen/driver_util.h>
20536 +#include <xen/interface/xen.h>
20537 +#include <xen/interface/xenoprof.h>
20538 +#include <xen/xenoprof.h>
20539 +#include "op_counter.h"
20540 +
20541 +static unsigned int num_events = 0;
20542 +
20543 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
20544 +{
20545 +       num_events = init->num_events;
20546 +       /* just in case - make sure we do not overflow event list 
20547 +          (i.e. counter_config list) */
20548 +       if (num_events > OP_MAX_COUNTER) {
20549 +               num_events = OP_MAX_COUNTER;
20550 +               init->num_events = num_events;
20551 +       }
20552 +}
20553 +
20554 +void xenoprof_arch_counter(void)
20555 +{
20556 +       int i;
20557 +       struct xenoprof_counter counter;
20558 +
20559 +       for (i=0; i<num_events; i++) {
20560 +               counter.ind       = i;
20561 +               counter.count     = (uint64_t)counter_config[i].count;
20562 +               counter.enabled   = (uint32_t)counter_config[i].enabled;
20563 +               counter.event     = (uint32_t)counter_config[i].event;
20564 +               counter.kernel    = (uint32_t)counter_config[i].kernel;
20565 +               counter.user      = (uint32_t)counter_config[i].user;
20566 +               counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
20567 +               HYPERVISOR_xenoprof_op(XENOPROF_counter, 
20568 +                                      &counter);
20569 +       }
20570 +}
20571 +
20572 +void xenoprof_arch_start(void) 
20573 +{
20574 +       /* nothing */
20575 +}
20576 +
20577 +void xenoprof_arch_stop(void)
20578 +{
20579 +       /* nothing */
20580 +}
20581 +
20582 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
20583 +{
20584 +       if (sbuf->buffer) {
20585 +               vunmap(sbuf->buffer);
20586 +               sbuf->buffer = NULL;
20587 +       }
20588 +}
20589 +
20590 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
20591 +                                   struct xenoprof_shared_buffer * sbuf)
20592 +{
20593 +       int npages, ret;
20594 +       struct vm_struct *area;
20595 +
20596 +       sbuf->buffer = NULL;
20597 +       if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
20598 +               return ret;
20599 +
20600 +       npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
20601 +
20602 +       area = alloc_vm_area(npages * PAGE_SIZE);
20603 +       if (area == NULL)
20604 +               return -ENOMEM;
20605 +
20606 +       if ( (ret = direct_kernel_remap_pfn_range(
20607 +                     (unsigned long)area->addr,
20608 +                     get_buffer->buf_gmaddr >> PAGE_SHIFT,
20609 +                     npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
20610 +                     DOMID_SELF)) ) {
20611 +               vunmap(area->addr);
20612 +               return ret;
20613 +       }
20614 +
20615 +       sbuf->buffer = area->addr;
20616 +       return ret;
20617 +}
20618 +
20619 +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
20620 +                             struct xenoprof_shared_buffer * sbuf)
20621 +{
20622 +       int ret;
20623 +       int npages;
20624 +       struct vm_struct *area;
20625 +       pgprot_t prot = __pgprot(_KERNPG_TABLE);
20626 +
20627 +       sbuf->buffer = NULL;
20628 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
20629 +       if (ret)
20630 +               goto out;
20631 +
20632 +       npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
20633 +
20634 +       area = alloc_vm_area(npages * PAGE_SIZE);
20635 +       if (area == NULL) {
20636 +               ret = -ENOMEM;
20637 +               goto out;
20638 +       }
20639 +
20640 +       ret = direct_kernel_remap_pfn_range(
20641 +               (unsigned long)area->addr,
20642 +               pdomain->buf_gmaddr >> PAGE_SHIFT,
20643 +               npages * PAGE_SIZE, prot, DOMID_SELF);
20644 +       if (ret) {
20645 +               vunmap(area->addr);
20646 +               goto out;
20647 +       }
20648 +       sbuf->buffer = area->addr;
20649 +
20650 +out:
20651 +       return ret;
20652 +}
20653 +
20654 +struct op_counter_config counter_config[OP_MAX_COUNTER];
20655 +
20656 +int xenoprof_create_files(struct super_block * sb, struct dentry * root)
20657 +{
20658 +       unsigned int i;
20659 +
20660 +       for (i = 0; i < num_events; ++i) {
20661 +               struct dentry * dir;
20662 +               char buf[2];
20663 +
20664 +               snprintf(buf, 2, "%d", i);
20665 +               dir = oprofilefs_mkdir(sb, root, buf);
20666 +               oprofilefs_create_ulong(sb, dir, "enabled",
20667 +                                       &counter_config[i].enabled);
20668 +               oprofilefs_create_ulong(sb, dir, "event",
20669 +                                       &counter_config[i].event);
20670 +               oprofilefs_create_ulong(sb, dir, "count",
20671 +                                       &counter_config[i].count);
20672 +               oprofilefs_create_ulong(sb, dir, "unit_mask",
20673 +                                       &counter_config[i].unit_mask);
20674 +               oprofilefs_create_ulong(sb, dir, "kernel",
20675 +                                       &counter_config[i].kernel);
20676 +               oprofilefs_create_ulong(sb, dir, "user",
20677 +                                       &counter_config[i].user);
20678 +       }
20679 +
20680 +       return 0;
20681 +}
20682 +
20683 +int __init oprofile_arch_init(struct oprofile_operations * ops)
20684 +{
20685 +       return xenoprofile_init(ops);
20686 +}
20687 +
20688 +void oprofile_arch_exit(void)
20689 +{
20690 +       xenoprofile_exit();
20691 +}
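
Not part of the patch itself: the npages computation in xenoprof_arch_map_shared_buffer() and xenoprof_arch_set_passive() above is a ceiling division, rounding the total sample-buffer size (bufsize * nbuf bytes) up to whole pages before alloc_vm_area() is called. A minimal sketch of the same idiom, assuming a 4 KiB page size purely for illustration (EXAMPLE_PAGE_SIZE and pages_needed are not names from the patch):

/*
 * Sketch only: round a byte count up to whole pages, as done for
 * bufsize * nbuf in the two mapping functions above.
 */
#define EXAMPLE_PAGE_SIZE 4096UL

static unsigned long pages_needed(unsigned long bytes)
{
        /* valid for bytes >= 1: 1..4096 -> 1 page, 4097..8192 -> 2, ... */
        return (bytes - 1) / EXAMPLE_PAGE_SIZE + 1;
}
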
20692 diff -ruNp linux-2.6.19/arch/i386/pci/Makefile linux-2.6.19-xen-3.0.4/arch/i386/pci/Makefile
20693 --- linux-2.6.19/arch/i386/pci/Makefile 2006-11-29 21:57:37.000000000 +0000
20694 +++ linux-2.6.19-xen-3.0.4/arch/i386/pci/Makefile       2007-02-02 19:10:21.000000000 +0000
20695 @@ -4,6 +4,10 @@ obj-$(CONFIG_PCI_BIOS)         += pcbios.o
20696  obj-$(CONFIG_PCI_MMCONFIG)     += mmconfig.o direct.o
20697  obj-$(CONFIG_PCI_DIRECT)       += direct.o
20698  
20699 +# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only
20700 +# take over if direct access to the PCI bus is unavailable
20701 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
20702 +
20703  pci-y                          := fixup.o
20704  pci-$(CONFIG_ACPI)             += acpi.o
20705  pci-y                          += legacy.o irq.o
20706 @@ -12,3 +16,8 @@ pci-$(CONFIG_X86_VISWS)               := visws.o fixu
20707  pci-$(CONFIG_X86_NUMAQ)                := numa.o irq.o
20708  
20709  obj-y                          += $(pci-y) common.o early.o
20710 +
20711 +ifdef CONFIG_XEN
20712 +include $(srctree)/scripts/Makefile.xen
20713 +obj-y := $(call cherrypickxen, $(obj-y))
20714 +endif
20715 diff -ruNp linux-2.6.19/arch/i386/pci/irq-xen.c linux-2.6.19-xen-3.0.4/arch/i386/pci/irq-xen.c
20716 --- linux-2.6.19/arch/i386/pci/irq-xen.c        1970-01-01 00:00:00.000000000 +0000
20717 +++ linux-2.6.19-xen-3.0.4/arch/i386/pci/irq-xen.c      2007-02-02 19:10:21.000000000 +0000
20718 @@ -0,0 +1,1167 @@
20719 +/*
20720 + *     Low-Level PCI Support for PC -- Routing of Interrupts
20721 + *
20722 + *     (c) 1999--2000 Martin Mares <mj@ucw.cz>
20723 + */
20724 +
20725 +#include <linux/types.h>
20726 +#include <linux/kernel.h>
20727 +#include <linux/pci.h>
20728 +#include <linux/init.h>
20729 +#include <linux/slab.h>
20730 +#include <linux/interrupt.h>
20731 +#include <linux/dmi.h>
20732 +#include <asm/io.h>
20733 +#include <asm/smp.h>
20734 +#include <asm/io_apic.h>
20735 +#include <linux/irq.h>
20736 +#include <linux/acpi.h>
20737 +
20738 +#include "pci.h"
20739 +
20740 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
20741 +#define PIRQ_VERSION 0x0100
20742 +
20743 +static int broken_hp_bios_irq9;
20744 +static int acer_tm360_irqrouting;
20745 +
20746 +static struct irq_routing_table *pirq_table;
20747 +
20748 +static int pirq_enable_irq(struct pci_dev *dev);
20749 +
20750 +/*
20751 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
20752 + * Avoid using: 13, 14 and 15 (FP error and IDE).
20753 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
20754 + */
20755 +unsigned int pcibios_irq_mask = 0xfff8;
20756 +
20757 +static int pirq_penalty[16] = {
20758 +       1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
20759 +       0, 0, 0, 0, 1000, 100000, 100000, 100000
20760 +};
20761 +
20762 +struct irq_router {
20763 +       char *name;
20764 +       u16 vendor, device;
20765 +       int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
20766 +       int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
20767 +};
20768 +
20769 +struct irq_router_handler {
20770 +       u16 vendor;
20771 +       int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
20772 +};
20773 +
20774 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
20775 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
20776 +
20777 +/*
20778 + *  Check passed address for the PCI IRQ Routing Table signature
20779 + *  and perform checksum verification.
20780 + */
20781 +
20782 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
20783 +{
20784 +       struct irq_routing_table *rt;
20785 +       int i;
20786 +       u8 sum;
20787 +
20788 +       rt = (struct irq_routing_table *) addr;
20789 +       if (rt->signature != PIRQ_SIGNATURE ||
20790 +           rt->version != PIRQ_VERSION ||
20791 +           rt->size % 16 ||
20792 +           rt->size < sizeof(struct irq_routing_table))
20793 +               return NULL;
20794 +       sum = 0;
20795 +       for (i=0; i < rt->size; i++)
20796 +               sum += addr[i];
20797 +       if (!sum) {
20798 +               DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
20799 +               return rt;
20800 +       }
20801 +       return NULL;
20802 +}
20803 +
20804 +
20805 +
20806 +/*
20807 + *  Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
20808 + */
20809 +
20810 +static struct irq_routing_table * __init pirq_find_routing_table(void)
20811 +{
20812 +       u8 *addr;
20813 +       struct irq_routing_table *rt;
20814 +
20815 +#ifdef CONFIG_XEN
20816 +       if (!is_initial_xendomain())
20817 +               return NULL;
20818 +#endif
20819 +       if (pirq_table_addr) {
20820 +               rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
20821 +               if (rt)
20822 +                       return rt;
20823 +               printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
20824 +       }
20825 +       for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
20826 +               rt = pirq_check_routing_table(addr);
20827 +               if (rt)
20828 +                       return rt;
20829 +       }
20830 +       return NULL;
20831 +}
20832 +
20833 +/*
20834 + *  If we have an IRQ routing table, use it to search for peer host
20835 + *  bridges.  It's a gross hack, but since there are no other known
20836 + *  ways to get a list of buses, we have to go this way.
20837 + */
20838 +
20839 +static void __init pirq_peer_trick(void)
20840 +{
20841 +       struct irq_routing_table *rt = pirq_table;
20842 +       u8 busmap[256];
20843 +       int i;
20844 +       struct irq_info *e;
20845 +
20846 +       memset(busmap, 0, sizeof(busmap));
20847 +       for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
20848 +               e = &rt->slots[i];
20849 +#ifdef DEBUG
20850 +               {
20851 +                       int j;
20852 +                       DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
20853 +                       for(j=0; j<4; j++)
20854 +                               DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
20855 +                       DBG("\n");
20856 +               }
20857 +#endif
20858 +               busmap[e->bus] = 1;
20859 +       }
20860 +       for(i = 1; i < 256; i++) {
20861 +               if (!busmap[i] || pci_find_bus(0, i))
20862 +                       continue;
20863 +               if (pci_scan_bus(i, &pci_root_ops, NULL))
20864 +                       printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
20865 +       }
20866 +       pcibios_last_bus = -1;
20867 +}
20868 +
20869 +/*
20870 + *  Code for querying and setting of IRQ routes on various interrupt routers.
20871 + */
20872 +
20873 +void eisa_set_level_irq(unsigned int irq)
20874 +{
20875 +       unsigned char mask = 1 << (irq & 7);
20876 +       unsigned int port = 0x4d0 + (irq >> 3);
20877 +       unsigned char val;
20878 +       static u16 eisa_irq_mask;
20879 +
20880 +       if (irq >= 16 || (1 << irq) & eisa_irq_mask)
20881 +               return;
20882 +
20883 +       eisa_irq_mask |= (1 << irq);
20884 +       printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
20885 +       val = inb(port);
20886 +       if (!(val & mask)) {
20887 +               DBG(KERN_DEBUG " -> edge");
20888 +               outb(val | mask, port);
20889 +       }
20890 +}
20891 +
20892 +/*
20893 + * Common IRQ routing practice: nybbles in config space,
20894 + * offset by some magic constant.
20895 + */
20896 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
20897 +{
20898 +       u8 x;
20899 +       unsigned reg = offset + (nr >> 1);
20900 +
20901 +       pci_read_config_byte(router, reg, &x);
20902 +       return (nr & 1) ? (x >> 4) : (x & 0xf);
20903 +}
20904 +
20905 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
20906 +{
20907 +       u8 x;
20908 +       unsigned reg = offset + (nr >> 1);
20909 +
20910 +       pci_read_config_byte(router, reg, &x);
20911 +       x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
20912 +       pci_write_config_byte(router, reg, x);
20913 +}
20914 +
20915 +/*
20916 + * ALI pirq entries are damn ugly, and completely undocumented.
20917 + * This has been figured out from pirq tables, and it's not a pretty
20918 + * picture.
20919 + */
20920 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20921 +{
20922 +       static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
20923 +
20924 +       return irqmap[read_config_nybble(router, 0x48, pirq-1)];
20925 +}
20926 +
20927 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20928 +{
20929 +       static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
20930 +       unsigned int val = irqmap[irq];
20931 +               
20932 +       if (val) {
20933 +               write_config_nybble(router, 0x48, pirq-1, val);
20934 +               return 1;
20935 +       }
20936 +       return 0;
20937 +}
20938 +
20939 +/*
20940 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
20941 + * just a pointer to the config space.
20942 + */
20943 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20944 +{
20945 +       u8 x;
20946 +
20947 +       pci_read_config_byte(router, pirq, &x);
20948 +       return (x < 16) ? x : 0;
20949 +}
20950 +
20951 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20952 +{
20953 +       pci_write_config_byte(router, pirq, irq);
20954 +       return 1;
20955 +}
20956 +
20957 +/*
20958 + * The VIA pirq rules are nibble-based, like ALI,
20959 + * but without the ugly irq number munging.
20960 + * However, PIRQD is in the upper instead of lower 4 bits.
20961 + */
20962 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20963 +{
20964 +       return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
20965 +}
20966 +
20967 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20968 +{
20969 +       write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
20970 +       return 1;
20971 +}
20972 +
20973 +/*
20974 + * The VIA pirq rules are nibble-based, like ALI,
20975 + * but without the ugly irq number munging.
20976 + * However, for the 82C586, the nibble map is different.
20977 + */
20978 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20979 +{
20980 +       static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
20981 +       return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
20982 +}
20983 +
20984 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20985 +{
20986 +       static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
20987 +       write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
20988 +       return 1;
20989 +}
20990 +
20991 +/*
20992 + * ITE 8330G pirq rules are nibble-based
20993 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
20994 + *       2+3 are both mapped to irq 9 on my system
20995 + */
20996 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20997 +{
20998 +       static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20999 +       return read_config_nybble(router,0x43, pirqmap[pirq-1]);
21000 +}
21001 +
21002 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21003 +{
21004 +       static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
21005 +       write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
21006 +       return 1;
21007 +}
21008 +
21009 +/*
21010 + * OPTI: high four bits are nibble pointer..
21011 + * I wonder what the low bits do?
21012 + */
21013 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21014 +{
21015 +       return read_config_nybble(router, 0xb8, pirq >> 4);
21016 +}
21017 +
21018 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21019 +{
21020 +       write_config_nybble(router, 0xb8, pirq >> 4, irq);
21021 +       return 1;
21022 +}
21023 +
21024 +/*
21025 + * Cyrix: nibble offset 0x5C
21026 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA 
21027 + * 0x5D bits 7:4 is INTD bits 3:0 is INTC
21028 + */
21029 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21030 +{
21031 +       return read_config_nybble(router, 0x5C, (pirq-1)^1);
21032 +}
21033 +
21034 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21035 +{
21036 +       write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
21037 +       return 1;
21038 +}
21039 +
21040 +/*
21041 + *     PIRQ routing for SiS 85C503 router used in several SiS chipsets.
21042 + *     We have to deal with the following issues here:
21043 + *     - vendors have different ideas about the meaning of link values
21044 + *     - some onboard devices (integrated in the chipset) have special
21045 + *       links and are thus routed differently (i.e. not via PCI INTA-INTD)
21046 + *     - different revision of the router have a different layout for
21047 + *       the routing registers, particularly for the onchip devices
21048 + *
21049 + *     For all routing registers the common thing is we have one byte
21050 + *     per routeable link which is defined as:
21051 + *              bit 7      IRQ mapping enabled (0) or disabled (1)
21052 + *              bits [6:4] reserved (sometimes used for onchip devices)
21053 + *              bits [3:0] IRQ to map to
21054 + *                  allowed: 3-7, 9-12, 14-15
21055 + *                  reserved: 0, 1, 2, 8, 13
21056 + *
21057 + *     The config-space registers located at 0x41/0x42/0x43/0x44 are
21058 + *     always used to route the normal PCI INT A/B/C/D respectively.
21059 + *     Apparently there are systems implementing PCI routing table using
21060 + *     link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
21061 + *     We try our best to handle both link mappings.
21062 + *     
21063 + *     Currently (2003-05-21) it appears most SiS chipsets follow the
21064 + *     definition of routing registers from the SiS-5595 southbridge.
21065 + *     According to the SiS 5595 datasheets the revision id's of the
21066 + *     router (ISA-bridge) should be 0x01 or 0xb0.
21067 + *
21068 + *     Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
21069 + *     Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
21070 + *     They seem to work with the current routing code. However there is
21071 + *     some concern because of the two USB-OHCI HCs (original SiS 5595
21072 + *     had only one). YMMV.
21073 + *
21074 + *     Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
21075 + *
21076 + *     0x61:   IDEIRQ:
21077 + *             bits [6:5] must be written 01
21078 + *             bit 4 channel-select primary (0), secondary (1)
21079 + *
21080 + *     0x62:   USBIRQ:
21081 + *             bit 6 OHCI function disabled (0), enabled (1)
21082 + *     
21083 + *     0x6a:   ACPI/SCI IRQ: bits 4-6 reserved
21084 + *
21085 + *     0x7e:   Data Acq. Module IRQ - bits 4-6 reserved
21086 + *
21087 + *     We support USBIRQ (in addition to INTA-INTD) and keep the
21088 + *     IDE, ACPI and DAQ routing untouched as set by the BIOS.
21089 + *
21090 + *     Currently the only reported exception is the new SiS 65x chipset
21091 + *     which includes the SiS 69x southbridge. Here we have the 85C503
21092 + *     router revision 0x04 and there are changes in the register layout
21093 + *     mostly related to the different USB HCs with USB 2.0 support.
21094 + *
21095 + *     Onchip routing for router rev-id 0x04 (trial-and-error observation)
21096 + *
21097 + *     0x60/0x61/0x62/0x63:    1xEHCI and 3xOHCI (companion) USB-HCs
21098 + *                             bits 6-4 are probably unused, unlike the 5595
21099 + */
21100 +
21101 +#define PIRQ_SIS_IRQ_MASK      0x0f
21102 +#define PIRQ_SIS_IRQ_DISABLE   0x80
21103 +#define PIRQ_SIS_USB_ENABLE    0x40
21104 +
21105 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21106 +{
21107 +       u8 x;
21108 +       int reg;
21109 +
21110 +       reg = pirq;
21111 +       if (reg >= 0x01 && reg <= 0x04)
21112 +               reg += 0x40;
21113 +       pci_read_config_byte(router, reg, &x);
21114 +       return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
21115 +}
21116 +
21117 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21118 +{
21119 +       u8 x;
21120 +       int reg;
21121 +
21122 +       reg = pirq;
21123 +       if (reg >= 0x01 && reg <= 0x04)
21124 +               reg += 0x40;
21125 +       pci_read_config_byte(router, reg, &x);
21126 +       x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
21127 +       x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
21128 +       pci_write_config_byte(router, reg, x);
21129 +       return 1;
21130 +}
21131 +
21132 +
21133 +/*
21134 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
21135 + *       config space of VLSI 82C534 PCI-bridge/router (1004:0102)
21136 + *       Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
21137 + *       devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
21138 + *       for the busbridge to the docking station.
21139 + */
21140 +
21141 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21142 +{
21143 +       if (pirq > 8) {
21144 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21145 +               return 0;
21146 +       }
21147 +       return read_config_nybble(router, 0x74, pirq-1);
21148 +}
21149 +
21150 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21151 +{
21152 +       if (pirq > 8) {
21153 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
21154 +               return 0;
21155 +       }
21156 +       write_config_nybble(router, 0x74, pirq-1, irq);
21157 +       return 1;
21158 +}
21159 +
21160 +/*
21161 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
21162 + * and Redirect I/O registers (0x0c00 and 0x0c01).  The Index register
21163 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a.  The Redirect
21164 + * register is a straight binary coding of desired PIC IRQ (low nibble).
21165 + *
21166 + * The 'link' value in the PIRQ table is already in the correct format
21167 + * for the Index register.  There are some special index values:
21168 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
21169 + * and 0x03 for SMBus.
21170 + */
21171 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21172 +{
21173 +       outb_p(pirq, 0xc00);
21174 +       return inb(0xc01) & 0xf;
21175 +}
21176 +
21177 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21178 +{
21179 +       outb_p(pirq, 0xc00);
21180 +       outb_p(irq, 0xc01);
21181 +       return 1;
21182 +}
21183 +
21184 +/* Support for AMD756 PCI IRQ Routing
21185 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
21186 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
21187 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
21188 + * The AMD756 pirq rules are nibble-based
21189 + * offset 0x56 0-3 PIRQA  4-7  PIRQB
21190 + * offset 0x57 0-3 PIRQC  4-7  PIRQD
21191 + */
21192 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
21193 +{
21194 +       u8 irq;
21195 +       irq = 0;
21196 +       if (pirq <= 4)
21197 +       {
21198 +               irq = read_config_nybble(router, 0x56, pirq - 1);
21199 +       }
21200 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
21201 +               dev->vendor, dev->device, pirq, irq);
21202 +       return irq;
21203 +}
21204 +
21205 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21206 +{
21207 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 
21208 +               dev->vendor, dev->device, pirq, irq);
21209 +       if (pirq <= 4)
21210 +       {
21211 +               write_config_nybble(router, 0x56, pirq - 1, irq);
21212 +       }
21213 +       return 1;
21214 +}
21215 +
21216 +#ifdef CONFIG_PCI_BIOS
21217 +
21218 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
21219 +{
21220 +       struct pci_dev *bridge;
21221 +       int pin = pci_get_interrupt_pin(dev, &bridge);
21222 +       return pcibios_set_irq_routing(bridge, pin, irq);
21223 +}
21224 +
21225 +#endif
21226 +
21227 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21228 +{
21229 +       static struct pci_device_id __initdata pirq_440gx[] = {
21230 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
21231 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
21232 +               { },
21233 +       };
21234 +
21235 +       /* 440GX has a proprietary PIRQ router -- don't use it */
21236 +       if (pci_dev_present(pirq_440gx))
21237 +               return 0;
21238 +
21239 +       switch(device)
21240 +       {
21241 +               case PCI_DEVICE_ID_INTEL_82371FB_0:
21242 +               case PCI_DEVICE_ID_INTEL_82371SB_0:
21243 +               case PCI_DEVICE_ID_INTEL_82371AB_0:
21244 +               case PCI_DEVICE_ID_INTEL_82371MX:
21245 +               case PCI_DEVICE_ID_INTEL_82443MX_0:
21246 +               case PCI_DEVICE_ID_INTEL_82801AA_0:
21247 +               case PCI_DEVICE_ID_INTEL_82801AB_0:
21248 +               case PCI_DEVICE_ID_INTEL_82801BA_0:
21249 +               case PCI_DEVICE_ID_INTEL_82801BA_10:
21250 +               case PCI_DEVICE_ID_INTEL_82801CA_0:
21251 +               case PCI_DEVICE_ID_INTEL_82801CA_12:
21252 +               case PCI_DEVICE_ID_INTEL_82801DB_0:
21253 +               case PCI_DEVICE_ID_INTEL_82801E_0:
21254 +               case PCI_DEVICE_ID_INTEL_82801EB_0:
21255 +               case PCI_DEVICE_ID_INTEL_ESB_1:
21256 +               case PCI_DEVICE_ID_INTEL_ICH6_0:
21257 +               case PCI_DEVICE_ID_INTEL_ICH6_1:
21258 +               case PCI_DEVICE_ID_INTEL_ICH7_0:
21259 +               case PCI_DEVICE_ID_INTEL_ICH7_1:
21260 +               case PCI_DEVICE_ID_INTEL_ICH7_30:
21261 +               case PCI_DEVICE_ID_INTEL_ICH7_31:
21262 +               case PCI_DEVICE_ID_INTEL_ESB2_0:
21263 +               case PCI_DEVICE_ID_INTEL_ICH8_0:
21264 +               case PCI_DEVICE_ID_INTEL_ICH8_1:
21265 +               case PCI_DEVICE_ID_INTEL_ICH8_2:
21266 +               case PCI_DEVICE_ID_INTEL_ICH8_3:
21267 +               case PCI_DEVICE_ID_INTEL_ICH8_4:
21268 +                       r->name = "PIIX/ICH";
21269 +                       r->get = pirq_piix_get;
21270 +                       r->set = pirq_piix_set;
21271 +                       return 1;
21272 +       }
21273 +       return 0;
21274 +}
21275 +
21276 +static __init int via_router_probe(struct irq_router *r,
21277 +                               struct pci_dev *router, u16 device)
21278 +{
21279 +       /* FIXME: We should move some of the quirk fixup stuff here */
21280 +
21281 +       /*
21282 +        * workarounds for some buggy BIOSes
21283 +        */
21284 +       if (device == PCI_DEVICE_ID_VIA_82C586_0) {
21285 +               switch(router->device) {
21286 +               case PCI_DEVICE_ID_VIA_82C686:
21287 +                       /*
21288 +                        * Asus k7m bios wrongly reports 82C686A
21289 +                        * as 586-compatible
21290 +                        */
21291 +                       device = PCI_DEVICE_ID_VIA_82C686;
21292 +                       break;
21293 +               case PCI_DEVICE_ID_VIA_8235:
21294 +                       /**
21295 +                        * Asus a7v-x bios wrongly reports 8235
21296 +                        * as 586-compatible
21297 +                        */
21298 +                       device = PCI_DEVICE_ID_VIA_8235;
21299 +                       break;
21300 +               }
21301 +       }
21302 +
21303 +       switch(device) {
21304 +       case PCI_DEVICE_ID_VIA_82C586_0:
21305 +               r->name = "VIA";
21306 +               r->get = pirq_via586_get;
21307 +               r->set = pirq_via586_set;
21308 +               return 1;
21309 +       case PCI_DEVICE_ID_VIA_82C596:
21310 +       case PCI_DEVICE_ID_VIA_82C686:
21311 +       case PCI_DEVICE_ID_VIA_8231:
21312 +       case PCI_DEVICE_ID_VIA_8233A:
21313 +       case PCI_DEVICE_ID_VIA_8235:
21314 +       case PCI_DEVICE_ID_VIA_8237:
21315 +               /* FIXME: add new ones for 8233/5 */
21316 +               r->name = "VIA";
21317 +               r->get = pirq_via_get;
21318 +               r->set = pirq_via_set;
21319 +               return 1;
21320 +       }
21321 +       return 0;
21322 +}
21323 +
21324 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21325 +{
21326 +       switch(device)
21327 +       {
21328 +               case PCI_DEVICE_ID_VLSI_82C534:
21329 +                       r->name = "VLSI 82C534";
21330 +                       r->get = pirq_vlsi_get;
21331 +                       r->set = pirq_vlsi_set;
21332 +                       return 1;
21333 +       }
21334 +       return 0;
21335 +}
21336 +
21337 +
21338 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21339 +{
21340 +       switch(device)
21341 +       {
21342 +               case PCI_DEVICE_ID_SERVERWORKS_OSB4:
21343 +               case PCI_DEVICE_ID_SERVERWORKS_CSB5:
21344 +                       r->name = "ServerWorks";
21345 +                       r->get = pirq_serverworks_get;
21346 +                       r->set = pirq_serverworks_set;
21347 +                       return 1;
21348 +       }
21349 +       return 0;
21350 +}
21351 +
21352 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21353 +{
21354 +       if (device != PCI_DEVICE_ID_SI_503)
21355 +               return 0;
21356 +               
21357 +       r->name = "SIS";
21358 +       r->get = pirq_sis_get;
21359 +       r->set = pirq_sis_set;
21360 +       return 1;
21361 +}
21362 +
21363 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21364 +{
21365 +       switch(device)
21366 +       {
21367 +               case PCI_DEVICE_ID_CYRIX_5520:
21368 +                       r->name = "NatSemi";
21369 +                       r->get = pirq_cyrix_get;
21370 +                       r->set = pirq_cyrix_set;
21371 +                       return 1;
21372 +       }
21373 +       return 0;
21374 +}
21375 +
21376 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21377 +{
21378 +       switch(device)
21379 +       {
21380 +               case PCI_DEVICE_ID_OPTI_82C700:
21381 +                       r->name = "OPTI";
21382 +                       r->get = pirq_opti_get;
21383 +                       r->set = pirq_opti_set;
21384 +                       return 1;
21385 +       }
21386 +       return 0;
21387 +}
21388 +
21389 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21390 +{
21391 +       switch(device)
21392 +       {
21393 +               case PCI_DEVICE_ID_ITE_IT8330G_0:
21394 +                       r->name = "ITE";
21395 +                       r->get = pirq_ite_get;
21396 +                       r->set = pirq_ite_set;
21397 +                       return 1;
21398 +       }
21399 +       return 0;
21400 +}
21401 +
21402 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21403 +{
21404 +       switch(device)
21405 +       {
21406 +       case PCI_DEVICE_ID_AL_M1533:
21407 +       case PCI_DEVICE_ID_AL_M1563:
21408 +               printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
21409 +               r->name = "ALI";
21410 +               r->get = pirq_ali_get;
21411 +               r->set = pirq_ali_set;
21412 +               return 1;
21413 +       }
21414 +       return 0;
21415 +}
21416 +
21417 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
21418 +{
21419 +       switch(device)
21420 +       {
21421 +               case PCI_DEVICE_ID_AMD_VIPER_740B:
21422 +                       r->name = "AMD756";
21423 +                       break;
21424 +               case PCI_DEVICE_ID_AMD_VIPER_7413:
21425 +                       r->name = "AMD766";
21426 +                       break;
21427 +               case PCI_DEVICE_ID_AMD_VIPER_7443:
21428 +                       r->name = "AMD768";
21429 +                       break;
21430 +               default:
21431 +                       return 0;
21432 +       }
21433 +       r->get = pirq_amd756_get;
21434 +       r->set = pirq_amd756_set;
21435 +       return 1;
21436 +}
21437 +               
21438 +static __initdata struct irq_router_handler pirq_routers[] = {
21439 +       { PCI_VENDOR_ID_INTEL, intel_router_probe },
21440 +       { PCI_VENDOR_ID_AL, ali_router_probe },
21441 +       { PCI_VENDOR_ID_ITE, ite_router_probe },
21442 +       { PCI_VENDOR_ID_VIA, via_router_probe },
21443 +       { PCI_VENDOR_ID_OPTI, opti_router_probe },
21444 +       { PCI_VENDOR_ID_SI, sis_router_probe },
21445 +       { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
21446 +       { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
21447 +       { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
21448 +       { PCI_VENDOR_ID_AMD, amd_router_probe },
21449 +       /* Someone with docs needs to add the ATI Radeon IGP */
21450 +       { 0, NULL }
21451 +};
21452 +static struct irq_router pirq_router;
21453 +static struct pci_dev *pirq_router_dev;
21454 +
21455 +
21456 +/*
21457 + *     FIXME: should we have an option to say "generic for
21458 + *     chipset" ?
21459 + */
21460 +
21461 +static void __init pirq_find_router(struct irq_router *r)
21462 +{
21463 +       struct irq_routing_table *rt = pirq_table;
21464 +       struct irq_router_handler *h;
21465 +
21466 +#ifdef CONFIG_PCI_BIOS
21467 +       if (!rt->signature) {
21468 +               printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
21469 +               r->set = pirq_bios_set;
21470 +               r->name = "BIOS";
21471 +               return;
21472 +       }
21473 +#endif
21474 +
21475 +       /* Default unless a driver reloads it */
21476 +       r->name = "default";
21477 +       r->get = NULL;
21478 +       r->set = NULL;
21479 +       
21480 +       DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
21481 +           rt->rtr_vendor, rt->rtr_device);
21482 +
21483 +       pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
21484 +       if (!pirq_router_dev) {
21485 +               DBG(KERN_DEBUG "PCI: Interrupt router not found at "
21486 +                       "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
21487 +               return;
21488 +       }
21489 +
21490 +       for( h = pirq_routers; h->vendor; h++) {
21491 +               /* First look for a router match */
21492 +               if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
21493 +                       break;
21494 +               /* Fall back to a device match */
21495 +               if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
21496 +                       break;
21497 +       }
21498 +       printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
21499 +               pirq_router.name,
21500 +               pirq_router_dev->vendor,
21501 +               pirq_router_dev->device,
21502 +               pci_name(pirq_router_dev));
21503 +}
21504 +
21505 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
21506 +{
21507 +       struct irq_routing_table *rt = pirq_table;
21508 +       int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
21509 +       struct irq_info *info;
21510 +
21511 +       for (info = rt->slots; entries--; info++)
21512 +               if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
21513 +                       return info;
21514 +       return NULL;
21515 +}
21516 +
21517 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
21518 +{
21519 +       u8 pin;
21520 +       struct irq_info *info;
21521 +       int i, pirq, newirq;
21522 +       int irq = 0;
21523 +       u32 mask;
21524 +       struct irq_router *r = &pirq_router;
21525 +       struct pci_dev *dev2 = NULL;
21526 +       char *msg = NULL;
21527 +
21528 +       /* Find IRQ pin */
21529 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21530 +       if (!pin) {
21531 +               DBG(KERN_DEBUG " -> no interrupt pin\n");
21532 +               return 0;
21533 +       }
21534 +       pin = pin - 1;
21535 +
21536 +       /* Find IRQ routing entry */
21537 +
21538 +       if (!pirq_table)
21539 +               return 0;
21540 +       
21541 +       DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
21542 +       info = pirq_get_info(dev);
21543 +       if (!info) {
21544 +               DBG(" -> not found in routing table\n" KERN_DEBUG);
21545 +               return 0;
21546 +       }
21547 +       pirq = info->irq[pin].link;
21548 +       mask = info->irq[pin].bitmap;
21549 +       if (!pirq) {
21550 +               DBG(" -> not routed\n" KERN_DEBUG);
21551 +               return 0;
21552 +       }
21553 +       DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
21554 +       mask &= pcibios_irq_mask;
21555 +
21556 +       /* Work around broken HP Pavilion Notebooks which assign USB to
21557 +          IRQ 9 even though it is actually wired to IRQ 11 */
21558 +
21559 +       if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
21560 +               dev->irq = 11;
21561 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
21562 +               r->set(pirq_router_dev, dev, pirq, 11);
21563 +       }
21564 +
21565 +       /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
21566 +       if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
21567 +               pirq = 0x68;
21568 +               mask = 0x400;
21569 +               dev->irq = r->get(pirq_router_dev, dev, pirq);
21570 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
21571 +       }
21572 +
21573 +       /*
21574 +        * Find the best IRQ to assign: use the one
21575 +        * reported by the device if possible.
21576 +        */
21577 +       newirq = dev->irq;
21578 +       if (newirq && !((1 << newirq) & mask)) {
21579 +               if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
21580 +               else printk("\n" KERN_WARNING
21581 +                       "PCI: IRQ %i for device %s doesn't match PIRQ mask "
21582 +                       "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
21583 +                       pci_name(dev));
21584 +       }
21585 +       if (!newirq && assign) {
21586 +               for (i = 0; i < 16; i++) {
21587 +                       if (!(mask & (1 << i)))
21588 +                               continue;
21589 +                       if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
21590 +                               newirq = i;
21591 +               }
21592 +       }
21593 +       DBG(" -> newirq=%d", newirq);
21594 +
21595 +       /* Check if it is hardcoded */
21596 +       if ((pirq & 0xf0) == 0xf0) {
21597 +               irq = pirq & 0xf;
21598 +               DBG(" -> hardcoded IRQ %d\n", irq);
21599 +               msg = "Hardcoded";
21600 +       } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
21601 +       ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
21602 +               DBG(" -> got IRQ %d\n", irq);
21603 +               msg = "Found";
21604 +               eisa_set_level_irq(irq);
21605 +       } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
21606 +               DBG(" -> assigning IRQ %d", newirq);
21607 +               if (r->set(pirq_router_dev, dev, pirq, newirq)) {
21608 +                       eisa_set_level_irq(newirq);
21609 +                       DBG(" ... OK\n");
21610 +                       msg = "Assigned";
21611 +                       irq = newirq;
21612 +               }
21613 +       }
21614 +
21615 +       if (!irq) {
21616 +               DBG(" ... failed\n");
21617 +               if (newirq && mask == (1 << newirq)) {
21618 +                       msg = "Guessed";
21619 +                       irq = newirq;
21620 +               } else
21621 +                       return 0;
21622 +       }
21623 +       printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
21624 +
21625 +       /* Update IRQ for all devices with the same pirq value */
21626 +       while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
21627 +               pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
21628 +               if (!pin)
21629 +                       continue;
21630 +               pin--;
21631 +               info = pirq_get_info(dev2);
21632 +               if (!info)
21633 +                       continue;
21634 +               if (info->irq[pin].link == pirq) {
21635 +                       /* We refuse to override the dev->irq information. Give a warning! */
21636 +                       if ( dev2->irq && dev2->irq != irq && \
21637 +                       (!(pci_probe & PCI_USE_PIRQ_MASK) || \
21638 +                       ((1 << dev2->irq) & mask)) ) {
21639 +#ifndef CONFIG_PCI_MSI
21640 +                               printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
21641 +                                      pci_name(dev2), dev2->irq, irq);
21642 +#endif
21643 +                               continue;
21644 +                       }
21645 +                       dev2->irq = irq;
21646 +                       pirq_penalty[irq]++;
21647 +                       if (dev != dev2)
21648 +                               printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
21649 +               }
21650 +       }
21651 +       return 1;
21652 +}
21653 +
21654 +static void __init pcibios_fixup_irqs(void)
21655 +{
21656 +       struct pci_dev *dev = NULL;
21657 +       u8 pin;
21658 +
21659 +       DBG(KERN_DEBUG "PCI: IRQ fixup\n");
21660 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
21661 +               /*
21662 +                * If the BIOS has set an out of range IRQ number, just ignore it.
21663 +                * Also keep track of which IRQ's are already in use.
21664 +                */
21665 +               if (dev->irq >= 16) {
21666 +                       DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
21667 +                       dev->irq = 0;
21668 +               }
21669 +               /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
21670 +               if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
21671 +                       pirq_penalty[dev->irq] = 0;
21672 +               pirq_penalty[dev->irq]++;
21673 +       }
21674 +
21675 +       dev = NULL;
21676 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
21677 +               pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21678 +#ifdef CONFIG_X86_IO_APIC
21679 +               /*
21680 +                * Recalculate IRQ numbers if we use the I/O APIC.
21681 +                */
21682 +               if (io_apic_assign_pci_irqs)
21683 +               {
21684 +                       int irq;
21685 +
21686 +                       if (pin) {
21687 +                               pin--;          /* interrupt pins are numbered starting from 1 */
21688 +                               irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
21689 +       /*
21690 +        * Busses behind bridges are typically not listed in the MP-table.
21691 +        * In this case we have to look up the IRQ based on the parent bus,
21692 +        * parent slot, and pin number. The SMP code detects such bridged
21693 +        * busses itself so we should get into this branch reliably.
21694 +        */
21695 +                               if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
21696 +                                       struct pci_dev * bridge = dev->bus->self;
21697 +
21698 +                                       pin = (pin + PCI_SLOT(dev->devfn)) % 4;
21699 +                                       irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
21700 +                                                       PCI_SLOT(bridge->devfn), pin);
21701 +                                       if (irq >= 0)
21702 +                                               printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
21703 +                                                       pci_name(bridge), 'A' + pin, irq);
21704 +                               }
21705 +                               if (irq >= 0) {
21706 +                                       printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
21707 +                                               pci_name(dev), 'A' + pin, irq);
21708 +                                       dev->irq = irq;
21709 +                               }
21710 +                       }
21711 +               }
21712 +#endif
21713 +               /*
21714 +                * Still no IRQ? Try to lookup one...
21715 +                */
21716 +               if (pin && !dev->irq)
21717 +                       pcibios_lookup_irq(dev, 0);
21718 +       }
21719 +}
21720 +
21721 +/*
21722 + * Work around broken HP Pavilion Notebooks which assign USB to
21723 + * IRQ 9 even though it is actually wired to IRQ 11
21724 + */
21725 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
21726 +{
21727 +       if (!broken_hp_bios_irq9) {
21728 +               broken_hp_bios_irq9 = 1;
21729 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
21730 +       }
21731 +       return 0;
21732 +}
21733 +
21734 +/*
21735 + * Work around broken Acer TravelMate 360 Notebooks which assign
21736 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
21737 + */
21738 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
21739 +{
21740 +       if (!acer_tm360_irqrouting) {
21741 +               acer_tm360_irqrouting = 1;
21742 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
21743 +       }
21744 +       return 0;
21745 +}
21746 +
21747 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
21748 +       {
21749 +               .callback = fix_broken_hp_bios_irq9,
21750 +               .ident = "HP Pavilion N5400 Series Laptop",
21751 +               .matches = {
21752 +                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
21753 +                       DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
21754 +                       DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
21755 +                       DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
21756 +               },
21757 +       },
21758 +       {
21759 +               .callback = fix_acer_tm360_irqrouting,
21760 +               .ident = "Acer TravelMate 36x Laptop",
21761 +               .matches = {
21762 +                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
21763 +                       DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
21764 +               },
21765 +       },
21766 +       { }
21767 +};
21768 +
21769 +static int __init pcibios_irq_init(void)
21770 +{
21771 +       DBG(KERN_DEBUG "PCI: IRQ init\n");
21772 +
21773 +       if (pcibios_enable_irq || raw_pci_ops == NULL)
21774 +               return 0;
21775 +
21776 +       dmi_check_system(pciirq_dmi_table);
21777 +
21778 +       pirq_table = pirq_find_routing_table();
21779 +
21780 +#ifdef CONFIG_PCI_BIOS
21781 +       if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
21782 +               pirq_table = pcibios_get_irq_routing_table();
21783 +#endif
21784 +       if (pirq_table) {
21785 +               pirq_peer_trick();
21786 +               pirq_find_router(&pirq_router);
21787 +               if (pirq_table->exclusive_irqs) {
21788 +                       int i;
21789 +                       for (i=0; i<16; i++)
21790 +                               if (!(pirq_table->exclusive_irqs & (1 << i)))
21791 +                                       pirq_penalty[i] += 100;
21792 +               }
21793 +               /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
21794 +               if (io_apic_assign_pci_irqs)
21795 +                       pirq_table = NULL;
21796 +       }
21797 +
21798 +       pcibios_enable_irq = pirq_enable_irq;
21799 +
21800 +       pcibios_fixup_irqs();
21801 +       return 0;
21802 +}
21803 +
21804 +subsys_initcall(pcibios_irq_init);
21805 +
21806 +
21807 +static void pirq_penalize_isa_irq(int irq, int active)
21808 +{
21809 +       /*
21810 +        *  If any ISAPnP device reports an IRQ in its list of possible
21811 +        *  IRQ's, we try to avoid assigning it to PCI devices.
21812 +        */
21813 +       if (irq < 16) {
21814 +               if (active)
21815 +                       pirq_penalty[irq] += 1000;
21816 +               else
21817 +                       pirq_penalty[irq] += 100;
21818 +       }
21819 +}
21820 +
21821 +void pcibios_penalize_isa_irq(int irq, int active)
21822 +{
21823 +#ifdef CONFIG_ACPI
21824 +       if (!acpi_noirq)
21825 +               acpi_penalize_isa_irq(irq, active);
21826 +       else
21827 +#endif
21828 +               pirq_penalize_isa_irq(irq, active);
21829 +}
21830 +
21831 +static int pirq_enable_irq(struct pci_dev *dev)
21832 +{
21833 +       u8 pin;
21834 +       struct pci_dev *temp_dev;
21835 +
21836 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21837 +       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
21838 +               char *msg = "";
21839 +
21840 +               pin--;          /* interrupt pins are numbered starting from 1 */
21841 +
21842 +               if (io_apic_assign_pci_irqs) {
21843 +                       int irq;
21844 +
21845 +                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
21846 +                       /*
21847 +                        * Busses behind bridges are typically not listed in the MP-table.
21848 +                        * In this case we have to look up the IRQ based on the parent bus,
21849 +                        * parent slot, and pin number. The SMP code detects such bridged
21850 +                        * busses itself so we should get into this branch reliably.
21851 +                        */
21852 +                       temp_dev = dev;
21853 +                       while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
21854 +                               struct pci_dev * bridge = dev->bus->self;
21855 +
21856 +                               pin = (pin + PCI_SLOT(dev->devfn)) % 4;
21857 +                               irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
21858 +                                               PCI_SLOT(bridge->devfn), pin);
21859 +                               if (irq >= 0)
21860 +                                       printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
21861 +                                               pci_name(bridge), 'A' + pin, irq);
21862 +                               dev = bridge;
21863 +                       }
21864 +                       dev = temp_dev;
21865 +                       if (irq >= 0) {
21866 +                               printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
21867 +                                       pci_name(dev), 'A' + pin, irq);
21868 +                               dev->irq = irq;
21869 +                               return 0;
21870 +                       } else
21871 +                               msg = " Probably buggy MP table.";
21872 +               } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
21873 +                       msg = "";
21874 +               else
21875 +                       msg = " Please try using pci=biosirq.";
21876 +
21877 +               /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
21878 +               if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
21879 +                       return 0;
21880 +
21881 +               printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
21882 +                      'A' + pin, pci_name(dev), msg);
21883 +       }
21884 +       return 0;
21885 +}
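
Not part of the patch: irq-xen.c above routes most chipsets through read_config_nybble()/write_config_nybble(), which pack two 4-bit PIRQ routing values into each router config-space byte, even-numbered links in the low nibble and odd-numbered links in the high nibble. A minimal sketch of the same bit manipulation on a byte that has already been read (nybble_extract, nybble_merge and reg_val are illustrative names, not part of the patch):

/*
 * Sketch only: mirrors the nibble packing used by read_config_nybble()
 * and write_config_nybble().  'reg_val' stands for the byte read from
 * the router at offset + (nr >> 1); no real config-space access here.
 */
static unsigned int nybble_extract(unsigned char reg_val, unsigned int nr)
{
        /* odd links live in the high nibble, even links in the low nibble */
        return (nr & 1) ? (reg_val >> 4) : (reg_val & 0x0f);
}

static unsigned char nybble_merge(unsigned char reg_val, unsigned int nr,
                                  unsigned int val)
{
        /* replace one nibble while preserving the other link's value */
        return (nr & 1) ? ((reg_val & 0x0f) | ((val & 0x0f) << 4))
                        : ((reg_val & 0xf0) | (val & 0x0f));
}
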
21886 diff -ruNp linux-2.6.19/arch/i386/pci/pcifront.c linux-2.6.19-xen-3.0.4/arch/i386/pci/pcifront.c
21887 --- linux-2.6.19/arch/i386/pci/pcifront.c       1970-01-01 00:00:00.000000000 +0000
21888 +++ linux-2.6.19-xen-3.0.4/arch/i386/pci/pcifront.c     2007-02-02 19:10:21.000000000 +0000
21889 @@ -0,0 +1,55 @@
21890 +/*
21891 + * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
21892 + *                     to support the Xen PCI Frontend's operation
21893 + *
21894 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
21895 + */
21896 +#include <linux/module.h>
21897 +#include <linux/init.h>
21898 +#include <linux/pci.h>
21899 +#include <asm/acpi.h>
21900 +#include "pci.h"
21901 +
21902 +static int pcifront_enable_irq(struct pci_dev *dev)
21903 +{
21904 +       u8 irq;
21905 +       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
21906 +       dev->irq = irq;
21907 +
21908 +       return 0;
21909 +}
21910 +
21911 +extern u8 pci_cache_line_size;
21912 +
21913 +static int __init pcifront_x86_stub_init(void)
21914 +{
21915 +       struct cpuinfo_x86 *c = &boot_cpu_data;
21916 +
21917 +       /* Only install our method if we haven't found real hardware already */
21918 +       if (raw_pci_ops)
21919 +               return 0;
21920 +
21921 +       printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
21922 +
21923 +       /* Copied from arch/i386/pci/common.c */
21924 +       pci_cache_line_size = 32 >> 2;
21925 +       if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
21926 +               pci_cache_line_size = 64 >> 2;  /* K7 & K8 */
21927 +       else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
21928 +               pci_cache_line_size = 128 >> 2; /* P4 */
21929 +
21930 +       /* On x86, we need to disable the normal IRQ routing table and
21931 +        * just ask the backend
21932 +        */
21933 +       pcibios_enable_irq = pcifront_enable_irq;
21934 +       pcibios_disable_irq = NULL;
21935 +
21936 +#ifdef CONFIG_ACPI
21937 +       /* Keep ACPI out of the picture */
21938 +       acpi_noirq = 1;
21939 +#endif
21940 +
21941 +       return 0;
21942 +}
21943 +
21944 +arch_initcall(pcifront_x86_stub_init);
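
For reference, the pci_cache_line_size shifts in pcifront_x86_stub_init() above express the cache line size in 32-bit dword units, the same granularity used by the PCI_CACHE_LINE_SIZE config register, so 32-, 64- and 128-byte lines become 8, 16 and 32. A minimal standalone sketch of that conversion; the helper name and the main() harness are illustrative, not part of the patch:

#include <stdio.h>

/* Hypothetical helper: convert a cache line size in bytes into the
 * dword units stored in pci_cache_line_size. */
static unsigned int cache_line_bytes_to_dwords(unsigned int bytes)
{
	return bytes >> 2;	/* 32 -> 8, 64 -> 16 (K7/K8), 128 -> 32 (P4) */
}

int main(void)
{
	printf("%u %u %u\n",
	       cache_line_bytes_to_dwords(32),
	       cache_line_bytes_to_dwords(64),
	       cache_line_bytes_to_dwords(128));
	return 0;
}
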
21945 diff -ruNp linux-2.6.19/arch/i386/power/Makefile linux-2.6.19-xen-3.0.4/arch/i386/power/Makefile
21946 --- linux-2.6.19/arch/i386/power/Makefile       2006-11-29 21:57:37.000000000 +0000
21947 +++ linux-2.6.19-xen-3.0.4/arch/i386/power/Makefile     2007-02-02 19:10:21.000000000 +0000
21948 @@ -1,2 +1,4 @@
21949 -obj-$(CONFIG_PM)               += cpu.o
21950 +obj-$(CONFIG_PM_LEGACY)                += cpu.o
21951 +obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o
21952 +obj-$(CONFIG_ACPI_SLEEP)       += cpu.o
21953  obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o
21954 diff -ruNp linux-2.6.19/arch/ia64/Kconfig linux-2.6.19-xen-3.0.4/arch/ia64/Kconfig
21955 --- linux-2.6.19/arch/ia64/Kconfig      2006-11-29 21:57:37.000000000 +0000
21956 +++ linux-2.6.19-xen-3.0.4/arch/ia64/Kconfig    2007-02-02 19:10:21.000000000 +0000
21957 @@ -58,6 +58,34 @@ config GENERIC_IOMAP
21958         bool
21959         default y
21960  
21961 +config XEN
21962 +       bool "Xen hypervisor support"
21963 +       default y
21964 +       help
21965 +         Enable Xen hypervisor support.  The resulting kernel runs
21966 +         both as a guest OS on Xen and natively on hardware.
21967 +
21968 +config XEN_IA64_VDSO_PARAVIRT
21969 +       bool
21970 +       depends on XEN && !ITANIUM
21971 +       default y
21972 +       help
21973 +         vDSO paravirtualization
21974 +
21975 +config XEN_IA64_EXPOSE_P2M
21976 +       bool "Xen/IA64 expose p2m table"
21977 +       depends on XEN
21978 +       default y
21979 +       help
21980 +         Expose the p2m table from Xen.
21981 +
21982 +config XEN_IA64_EXPOSE_P2M_USE_DTR
21983 +       bool "Xen/IA64 map p2m table with dtr"
21984 +       depends on XEN_IA64_EXPOSE_P2M
21985 +       default y
21986 +       help
21987 +         Use dtr to map the exposed p2m table.
21988 +
21989  config SCHED_NO_NO_OMIT_FRAME_POINTER
21990         bool
21991         default y
21992 @@ -468,6 +496,21 @@ config PCI_DOMAINS
21993         bool
21994         default PCI
21995  
21996 +config XEN_PCIDEV_FRONTEND
21997 +       bool "Xen PCI Frontend"
21998 +       depends on PCI && XEN
21999 +       default y
22000 +       help
22001 +         The PCI device frontend driver allows the kernel to import arbitrary
22002 +         PCI devices from a PCI backend to support PCI driver domains.
22003 +
22004 +config XEN_PCIDEV_FE_DEBUG
22005 +       bool "Xen PCI Frontend Debugging"
22006 +       depends on XEN_PCIDEV_FRONTEND
22007 +       default n
22008 +       help
22009 +         Enables some debug statements within the PCI Frontend.
22010 +
22011  source "drivers/pci/pcie/Kconfig"
22012  
22013  source "drivers/pci/Kconfig"
22014 @@ -540,3 +583,32 @@ source "arch/ia64/Kconfig.debug"
22015  source "security/Kconfig"
22016  
22017  source "crypto/Kconfig"
22018 +
22019 +#
22020 +# override default values of drivers/xen/Kconfig
22021 +#
22022 +if XEN
22023 +config XEN_UTIL
22024 +       default n
22025 +
22026 +config HAVE_ARCH_ALLOC_SKB
22027 +       default y
22028 +
22029 +config HAVE_ARCH_DEV_ALLOC_SKB
22030 +       default y
22031 +
22032 +config XEN_BALLOON
22033 +       default y
22034 +
22035 +config XEN_SKBUFF
22036 +       default y
22037 +       depends on NET
22038 +
22039 +config XEN_REBOOT
22040 +       default y
22041 +
22042 +config XEN_SMPBOOT
22043 +       default n
22044 +endif
22045 +
22046 +source "drivers/xen/Kconfig"
22047 diff -ruNp linux-2.6.19/arch/ia64/Makefile linux-2.6.19-xen-3.0.4/arch/ia64/Makefile
22048 --- linux-2.6.19/arch/ia64/Makefile     2006-11-29 21:57:37.000000000 +0000
22049 +++ linux-2.6.19-xen-3.0.4/arch/ia64/Makefile   2007-02-02 19:10:21.000000000 +0000
22050 @@ -45,6 +45,12 @@ ifeq ($(call cc-version),0304)
22051  endif
22052  
22053  CFLAGS += $(cflags-y)
22054 +
22055 +cppflags-$(CONFIG_XEN) += \
22056 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
22057 +
22058 +CPPFLAGS += $(cppflags-y)
22059 +
22060  head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
22061  
22062  libs-y                         += arch/ia64/lib/
22063 @@ -55,9 +61,15 @@ core-$(CONFIG_IA64_GENERIC)  += arch/ia6
22064  core-$(CONFIG_IA64_HP_ZX1)     += arch/ia64/dig/
22065  core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
22066  core-$(CONFIG_IA64_SGI_SN2)    += arch/ia64/sn/
22067 +core-$(CONFIG_XEN)             += arch/ia64/xen/
22068  
22069  drivers-$(CONFIG_PCI)          += arch/ia64/pci/
22070 +ifneq ($(CONFIG_XEN),y)
22071  drivers-$(CONFIG_IA64_HP_SIM)  += arch/ia64/hp/sim/
22072 +endif
22073 +ifneq ($(CONFIG_IA64_GENERIC),y)
22074 +drivers-$(CONFIG_XEN)          += arch/ia64/hp/sim/
22075 +endif
22076  drivers-$(CONFIG_IA64_HP_ZX1)  += arch/ia64/hp/common/ arch/ia64/hp/zx1/
22077  drivers-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
22078  drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/
22079 @@ -87,8 +99,8 @@ CLEAN_FILES += vmlinux.gz bootloader
22080  boot:  lib/lib.a vmlinux
22081         $(Q)$(MAKE) $(build)=$(boot) $@
22082  
22083 -install: vmlinux.gz
22084 -       sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)"
22085 +install:
22086 +       -yes | sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) vmlinux.gz System.map "$(INSTALL_PATH)"
22087  
22088  define archhelp
22089    echo '* compressed   - Build compressed kernel image'
22090 diff -ruNp linux-2.6.19/arch/ia64/dig/setup.c linux-2.6.19-xen-3.0.4/arch/ia64/dig/setup.c
22091 --- linux-2.6.19/arch/ia64/dig/setup.c  2006-11-29 21:57:37.000000000 +0000
22092 +++ linux-2.6.19-xen-3.0.4/arch/ia64/dig/setup.c        2007-02-02 19:10:21.000000000 +0000
22093 @@ -24,6 +24,8 @@
22094  #include <asm/machvec.h>
22095  #include <asm/system.h>
22096  
22097 +#include <xen/xencons.h>
22098 +
22099  void __init
22100  dig_setup (char **cmdline_p)
22101  {
22102 @@ -67,4 +69,19 @@ dig_setup (char **cmdline_p)
22103         screen_info.orig_video_mode = 3;        /* XXX fake */
22104         screen_info.orig_video_isVGA = 1;       /* XXX fake */
22105         screen_info.orig_video_ega_bx = 3;      /* XXX fake */
22106 +#ifdef CONFIG_XEN
22107 +       if (!is_running_on_xen() || !is_initial_xendomain())
22108 +               return;
22109 +
22110 +       if (xen_start_info->console.dom0.info_size >=
22111 +           sizeof(struct dom0_vga_console_info)) {
22112 +               const struct dom0_vga_console_info *info =
22113 +                       (struct dom0_vga_console_info *)(
22114 +                               (char *)xen_start_info +
22115 +                               xen_start_info->console.dom0.info_off);
22116 +               dom0_init_screen_info(info);
22117 +       }
22118 +       xen_start_info->console.domU.mfn = 0;
22119 +       xen_start_info->console.domU.evtchn = 0;
22120 +#endif
22121  }
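
The dom0 console hand-off in dig_setup() above follows a common pattern: xen_start_info carries an offset and a size for a trailing structure, and the consumer only trusts the trailing data after checking the advertised size. A small self-contained sketch of that pattern, with hypothetical structure names standing in for xen_start_info and dom0_vga_console_info:

#include <stddef.h>
#include <stdio.h>

struct vga_info {			/* stands in for dom0_vga_console_info */
	unsigned char video_type;
	unsigned short cols, rows;
};

struct start_info {			/* stands in for xen_start_info */
	unsigned long info_off;		/* offset of trailing data from the header */
	unsigned long info_size;	/* size of the trailing data */
};

struct carrier {			/* header followed by trailing data */
	struct start_info si;
	struct vga_info vi;
};

static void use_vga_info(const struct vga_info *vi)
{
	printf("type=%d %dx%d\n", vi->video_type, vi->cols, vi->rows);
}

int main(void)
{
	struct carrier c = { .vi = { 1, 80, 25 } };

	c.si.info_off  = offsetof(struct carrier, vi);
	c.si.info_size = sizeof(c.vi);

	/* only dereference the trailing struct if the producer says it is big enough */
	if (c.si.info_size >= sizeof(struct vga_info))
		use_vga_info((const struct vga_info *)((char *)&c + c.si.info_off));
	return 0;
}
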
22122 diff -ruNp linux-2.6.19/arch/ia64/hp/sim/Makefile linux-2.6.19-xen-3.0.4/arch/ia64/hp/sim/Makefile
22123 --- linux-2.6.19/arch/ia64/hp/sim/Makefile      2006-11-29 21:57:37.000000000 +0000
22124 +++ linux-2.6.19-xen-3.0.4/arch/ia64/hp/sim/Makefile    2007-02-02 19:10:21.000000000 +0000
22125 @@ -14,3 +14,5 @@ obj-$(CONFIG_HP_SIMETH)       += simeth.o
22126  obj-$(CONFIG_HP_SIMSERIAL) += simserial.o
22127  obj-$(CONFIG_HP_SIMSERIAL_CONSOLE) += hpsim_console.o
22128  obj-$(CONFIG_HP_SIMSCSI) += simscsi.o
22129 +obj-$(CONFIG_XEN) += simserial.o
22130 +obj-$(CONFIG_XEN) += hpsim_console.o
22131 diff -ruNp linux-2.6.19/arch/ia64/kernel/asm-offsets.c linux-2.6.19-xen-3.0.4/arch/ia64/kernel/asm-offsets.c
22132 --- linux-2.6.19/arch/ia64/kernel/asm-offsets.c 2006-11-29 21:57:37.000000000 +0000
22133 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/asm-offsets.c       2007-02-02 19:10:21.000000000 +0000
22134 @@ -268,4 +268,28 @@ void foo(void)
22135         DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
22136         DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
22137         DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
22138 +
22139 +#ifdef CONFIG_XEN
22140 +       BLANK();
22141 +
22142 +#define DEFINE_MAPPED_REG_OFS(sym, field) \
22143 +       DEFINE(sym, (XMAPPEDREGS_OFS + offsetof(mapped_regs_t, field)))
22144 +
22145 +       DEFINE_MAPPED_REG_OFS(XSI_PSR_I_ADDR_OFS, interrupt_mask_addr);
22146 +       DEFINE_MAPPED_REG_OFS(XSI_IPSR_OFS, ipsr);
22147 +       DEFINE_MAPPED_REG_OFS(XSI_IIP_OFS, iip);
22148 +       DEFINE_MAPPED_REG_OFS(XSI_IFS_OFS, ifs);
22149 +       DEFINE_MAPPED_REG_OFS(XSI_PRECOVER_IFS_OFS, precover_ifs);
22150 +       DEFINE_MAPPED_REG_OFS(XSI_ISR_OFS, isr);
22151 +       DEFINE_MAPPED_REG_OFS(XSI_IFA_OFS, ifa);
22152 +       DEFINE_MAPPED_REG_OFS(XSI_IIPA_OFS, iipa);
22153 +       DEFINE_MAPPED_REG_OFS(XSI_IIM_OFS, iim);
22154 +       DEFINE_MAPPED_REG_OFS(XSI_IHA_OFS, iha);
22155 +       DEFINE_MAPPED_REG_OFS(XSI_ITIR_OFS, itir);
22156 +       DEFINE_MAPPED_REG_OFS(XSI_PSR_IC_OFS, interrupt_collection_enabled);
22157 +       DEFINE_MAPPED_REG_OFS(XSI_INCOMPL_REGFR_OFS, incomplete_regframe);
22158 +       DEFINE_MAPPED_REG_OFS(XSI_BANKNUM_OFS, banknum);
22159 +       DEFINE_MAPPED_REG_OFS(XSI_BANK0_R16_OFS, bank0_regs[0]);
22160 +       DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]);
22161 +#endif /* CONFIG_XEN */
22162  }
22163 diff -ruNp linux-2.6.19/arch/ia64/kernel/entry.S linux-2.6.19-xen-3.0.4/arch/ia64/kernel/entry.S
22164 --- linux-2.6.19/arch/ia64/kernel/entry.S       2006-11-29 21:57:37.000000000 +0000
22165 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/entry.S     2007-02-02 19:10:21.000000000 +0000
22166 @@ -180,7 +180,7 @@ END(sys_clone)
22167   *     called.  The code starting at .map relies on this.  The rest of the code
22168   *     doesn't care about the interrupt masking status.
22169   */
22170 -GLOBAL_ENTRY(ia64_switch_to)
22171 +GLOBAL_ENTRY(__ia64_switch_to)
22172         .prologue
22173         alloc r16=ar.pfs,1,0,0,0
22174         DO_SAVE_SWITCH_STACK
22175 @@ -234,7 +234,7 @@ GLOBAL_ENTRY(ia64_switch_to)
22176         ;;
22177         srlz.d
22178         br.cond.sptk .done
22179 -END(ia64_switch_to)
22180 +END(__ia64_switch_to)
22181  
22182  /*
22183   * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
22184 @@ -375,7 +375,7 @@ END(save_switch_stack)
22185   *     - b7 holds address to return to
22186   *     - must not touch r8-r11
22187   */
22188 -ENTRY(load_switch_stack)
22189 +GLOBAL_ENTRY(load_switch_stack)
22190         .prologue
22191         .altrp b7
22192  
22193 @@ -510,7 +510,7 @@ END(clone)
22194          * because some system calls (such as ia64_execve) directly
22195          * manipulate ar.pfs.
22196          */
22197 -GLOBAL_ENTRY(ia64_trace_syscall)
22198 +GLOBAL_ENTRY(__ia64_trace_syscall)
22199         PT_REGS_UNWIND_INFO(0)
22200         /*
22201          * We need to preserve the scratch registers f6-f11 in case the system
22202 @@ -582,7 +582,7 @@ strace_error:
22203  (p6)   mov r10=-1
22204  (p6)   mov r8=r9
22205         br.cond.sptk .strace_save_retval
22206 -END(ia64_trace_syscall)
22207 +END(__ia64_trace_syscall)
22208  
22209         /*
22210          * When traced and returning from sigreturn, we invoke syscall_trace but then
22211 @@ -601,7 +601,7 @@ GLOBAL_ENTRY(ia64_strace_leave_kernel)
22212  .ret4: br.cond.sptk ia64_leave_kernel
22213  END(ia64_strace_leave_kernel)
22214  
22215 -GLOBAL_ENTRY(ia64_ret_from_clone)
22216 +GLOBAL_ENTRY(__ia64_ret_from_clone)
22217         PT_REGS_UNWIND_INFO(0)
22218  {      /*
22219          * Some versions of gas generate bad unwind info if the first instruction of a
22220 @@ -627,7 +627,7 @@ GLOBAL_ENTRY(ia64_ret_from_clone)
22221         cmp.ne p6,p0=r2,r0
22222  (p6)   br.cond.spnt .strace_check_retval
22223         ;;                                      // added stop bits to prevent r8 dependency
22224 -END(ia64_ret_from_clone)
22225 +END(__ia64_ret_from_clone)
22226         // fall through
22227  GLOBAL_ENTRY(ia64_ret_from_syscall)
22228         PT_REGS_UNWIND_INFO(0)
22229 @@ -635,8 +635,11 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
22230         adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
22231         mov r10=r0                              // clear error indication in r10
22232  (p7)   br.cond.spnt handle_syscall_error       // handle potential syscall failure
22233 +       ;;
22234 +       // don't fall through, ia64_leave_syscall may be #define'd
22235 +       br.cond.sptk.few ia64_leave_syscall
22236 +       ;;
22237  END(ia64_ret_from_syscall)
22238 -       // fall through
22239  /*
22240   * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
22241   *     need to switch to bank 0 and doesn't restore the scratch registers.
22242 @@ -681,7 +684,7 @@ END(ia64_ret_from_syscall)
22243   *           ar.csd: cleared
22244   *           ar.ssd: cleared
22245   */
22246 -ENTRY(ia64_leave_syscall)
22247 +GLOBAL_ENTRY(__ia64_leave_syscall)
22248         PT_REGS_UNWIND_INFO(0)
22249         /*
22250          * work.need_resched etc. mustn't get changed by this CPU before it returns to
22251 @@ -789,7 +792,7 @@ ENTRY(ia64_leave_syscall)
22252         mov.m ar.ssd=r0                 // M2   clear ar.ssd
22253         mov f11=f0                      // F    clear f11
22254         br.cond.sptk.many rbs_switch    // B
22255 -END(ia64_leave_syscall)
22256 +END(__ia64_leave_syscall)
22257  
22258  #ifdef CONFIG_IA32_SUPPORT
22259  GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
22260 @@ -801,10 +804,13 @@ GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
22261         st8.spill [r2]=r8       // store return value in slot for r8 and set unat bit
22262         .mem.offset 8,0
22263         st8.spill [r3]=r0       // clear error indication in slot for r10 and set unat bit
22264 +       ;;
22265 +       // don't fall through, ia64_leave_kernel may be #define'd
22266 +       br.cond.sptk.few ia64_leave_kernel
22267 +       ;;
22268  END(ia64_ret_from_ia32_execve)
22269 -       // fall through
22270  #endif /* CONFIG_IA32_SUPPORT */
22271 -GLOBAL_ENTRY(ia64_leave_kernel)
22272 +GLOBAL_ENTRY(__ia64_leave_kernel)
22273         PT_REGS_UNWIND_INFO(0)
22274         /*
22275          * work.need_resched etc. mustn't get changed by this CPU before it returns to
22276 @@ -1135,7 +1141,7 @@ skip_rbs_switch:
22277         ld8 r10=[r3]
22278         br.cond.sptk.many .work_processed_syscall       // re-check
22279  
22280 -END(ia64_leave_kernel)
22281 +END(__ia64_leave_kernel)
22282  
22283  ENTRY(handle_syscall_error)
22284         /*
22285 @@ -1175,7 +1181,7 @@ END(ia64_invoke_schedule_tail)
22286          * be set up by the caller.  We declare 8 input registers so the system call
22287          * args get preserved, in case we need to restart a system call.
22288          */
22289 -ENTRY(notify_resume_user)
22290 +GLOBAL_ENTRY(notify_resume_user)
22291         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
22292         alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
22293         mov r9=ar.unat
22294 @@ -1263,7 +1269,7 @@ ENTRY(sys_rt_sigreturn)
22295         adds sp=16,sp
22296         ;;
22297         ld8 r9=[sp]                             // load new ar.unat
22298 -       mov.sptk b7=r8,ia64_leave_kernel
22299 +       mov.sptk b7=r8,__ia64_leave_kernel
22300         ;;
22301         mov ar.unat=r9
22302         br.many b7
22303 diff -ruNp linux-2.6.19/arch/ia64/kernel/gate.S linux-2.6.19-xen-3.0.4/arch/ia64/kernel/gate.S
22304 --- linux-2.6.19/arch/ia64/kernel/gate.S        2006-11-29 21:57:37.000000000 +0000
22305 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/gate.S      2007-02-02 19:10:21.000000000 +0000
22306 @@ -6,13 +6,15 @@
22307   *     David Mosberger-Tang <davidm@hpl.hp.com>
22308   */
22309  
22310 -
22311  #include <asm/asmmacro.h>
22312  #include <asm/errno.h>
22313  #include <asm/asm-offsets.h>
22314  #include <asm/sigcontext.h>
22315  #include <asm/system.h>
22316  #include <asm/unistd.h>
22317 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22318 +# include <asm/privop.h>
22319 +#endif
22320  
22321  /*
22322   * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
22323 @@ -32,6 +34,52 @@
22324  [1:](pr)brl.cond.sptk 0;                               \
22325         .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-.
22326  
22327 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22328 +       // The page in which the hyperprivop lives must be pinned by an ITR.
22329 +       // However, the vDSO area isn't pinned, so issuing a hyperprivop
22330 +       // from the vDSO page causes the trouble Kevin pointed out:
22331 +       // after clearing vpsr.ic the vcpu may be preempted and the itlb
22332 +       // flushed; when the vcpu gets the cpu again, a tlb miss fault occurs,
22333 +       // which turns into a nested dtlb fault because vpsr.ic is off.
22334 +       // To avoid this situation, we jump into the kernel text area
22335 +       // (which is pinned), issue the hyperprivop there, and return back
22336 +       // to the vDSO page.
22337 +       // This is Dan Magenheimer's idea.
22338 +
22339 +       // Currently is_running_on_xen() is defined as running_on_xen.
22340 +       // If is_running_on_xen() ever becomes a real function, this code
22341 +       // must be updated accordingly.
22342 +       .section ".data.patch.running_on_xen", "a"
22343 +       .previous
22344 +#define LOAD_RUNNING_ON_XEN(reg)                       \
22345 +[1:]   movl reg=0;                                     \
22346 +       .xdata4 ".data.patch.running_on_xen", 1b-.
22347 +
22348 +       .section ".data.patch.brl_xen_rsm_be_i", "a"
22349 +       .previous
22350 +#define BRL_COND_XEN_RSM_BE_I(pr)                      \
22351 +[1:](pr)brl.cond.sptk 0;                               \
22352 +       .xdata4 ".data.patch.brl_xen_rsm_be_i", 1b-.
22353 +
22354 +       .section ".data.patch.brl_xen_get_psr", "a"
22355 +       .previous
22356 +#define BRL_COND_XEN_GET_PSR(pr)                       \
22357 +[1:](pr)brl.cond.sptk 0;                               \
22358 +       .xdata4 ".data.patch.brl_xen_get_psr", 1b-.
22359 +
22360 +       .section ".data.patch.brl_xen_ssm_i_0", "a"
22361 +       .previous
22362 +#define BRL_COND_XEN_SSM_I_0(pr)                       \
22363 +[1:](pr)brl.cond.sptk 0;                               \
22364 +       .xdata4 ".data.patch.brl_xen_ssm_i_0", 1b-.
22365 +
22366 +       .section ".data.patch.brl_xen_ssm_i_1", "a"
22367 +       .previous
22368 +#define BRL_COND_XEN_SSM_I_1(pr)                       \
22369 +[1:](pr)brl.cond.sptk 0;                               \
22370 +       .xdata4 ".data.patch.brl_xen_ssm_i_1", 1b-.
22371 +#endif
22372 +
22373  GLOBAL_ENTRY(__kernel_syscall_via_break)
22374         .prologue
22375         .altrp b6
22376 @@ -76,7 +124,42 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
22377         epc                                     // B    causes split-issue
22378  }
22379         ;;
22380 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22381 +       // r20 = 1
22382 +       // r22 = &vcpu->vcpu_info->evtchn_upcall_mask
22383 +       // r23 = &vpsr.ic
22384 +       // r24 = &vcpu->vcpu_info->evtchn_upcall_pending
22385 +       // r25 = tmp
22386 +       // r28 = &running_on_xen
22387 +       // r30 = running_on_xen
22388 +       // r31 = tmp
22389 +       // p11 = tmp
22390 +       // p12 = running_on_xen
22391 +       // p13 = !running_on_xen
22392 +       // p14 = tmp
22393 +       // p15 = tmp
22394 +#define isXen  p12
22395 +#define isRaw  p13
22396 +       LOAD_RUNNING_ON_XEN(r28)
22397 +       movl r22=XSI_PSR_I_ADDR
22398 +       ;;
22399 +       ld8 r22=[r22]
22400 +       ;;
22401 +       movl r23=XSI_PSR_IC
22402 +       adds r24=-1,r22
22403 +       mov r20=1
22404 +       ;;
22405 +       ld4 r30=[r28]
22406 +       ;;
22407 +       cmp.ne isXen,isRaw=r0,r30
22408 +       ;;
22409 +(isRaw)        rsm psr.be | psr.i
22410 +       BRL_COND_XEN_RSM_BE_I(isXen)
22411 +       .global .vdso_rsm_be_i_ret
22412 +.vdso_rsm_be_i_ret:
22413 +#else
22414         rsm psr.be | psr.i                      // M2 (5 cyc to srlz.d)
22415 +#endif
22416         LOAD_FSYSCALL_TABLE(r14)                // X
22417         ;;
22418         mov r16=IA64_KR(CURRENT)                // M2 (12 cyc)
22419 @@ -84,7 +167,14 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
22420         mov r19=NR_syscalls-1                   // A
22421         ;;
22422         lfetch [r18]                            // M0|1
22423 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22424 +(isRaw)        mov r29=psr
22425 +       BRL_COND_XEN_GET_PSR(isXen)
22426 +       .global .vdso_get_psr_ret
22427 +.vdso_get_psr_ret:
22428 +#else
22429         mov r29=psr                             // M2 (12 cyc)
22430 +#endif
22431         // If r17 is a NaT, p6 will be zero
22432         cmp.geu p6,p7=r19,r17                   // A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
22433         ;;
22434 @@ -98,9 +188,21 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
22435         ;;
22436         nop.m 0
22437  (p6)   tbit.z.unc p8,p0=r18,0                  // I0 (dual-issues with "mov b7=r18"!)
22438 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22439 +       ;;
22440 +       // p14 = running_on_xen && p8
22441 +       // p15 = !running_on_xen && p8
22442 +(p8)   cmp.ne.unc p14,p15=r0,r30
22443 +       ;;
22444 +(p15)  ssm psr.i
22445 +       BRL_COND_XEN_SSM_I_0(p14)
22446 +       .global .vdso_ssm_i_0_ret
22447 +.vdso_ssm_i_0_ret:
22448 +#else
22449         nop.i 0
22450         ;;
22451  (p8)   ssm psr.i
22452 +#endif
22453  (p6)   mov b7=r18                              // I0
22454  (p8)   br.dptk.many b7                         // B
22455  
22456 @@ -121,9 +223,21 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
22457  #else
22458         BRL_COND_FSYS_BUBBLE_DOWN(p6)
22459  #endif
22460 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22461 +(isRaw)        ssm psr.i
22462 +       BRL_COND_XEN_SSM_I_1(isXen)
22463 +       .global .vdso_ssm_i_1_ret
22464 +.vdso_ssm_i_1_ret:
22465 +#else
22466         ssm psr.i
22467 +#endif
22468         mov r10=-1
22469  (p10)  mov r8=EINVAL
22470 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22471 +       dv_serialize_data // shut up gas warning.
22472 +                         // neither xen_hyper_ssm_i_0 nor xen_hyper_ssm_i_1
22473 +                         // changes p9 or p10
22474 +#endif
22475  (p9)   mov r8=ENOSYS
22476         FSYS_RETURN
22477  END(__kernel_syscall_via_epc)
22478 diff -ruNp linux-2.6.19/arch/ia64/kernel/gate.lds.S linux-2.6.19-xen-3.0.4/arch/ia64/kernel/gate.lds.S
22479 --- linux-2.6.19/arch/ia64/kernel/gate.lds.S    2006-11-29 21:57:37.000000000 +0000
22480 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/gate.lds.S  2007-02-02 19:10:21.000000000 +0000
22481 @@ -43,6 +43,28 @@ SECTIONS
22482                                     __start_gate_brl_fsys_bubble_down_patchlist = .;
22483                                     *(.data.patch.brl_fsys_bubble_down)
22484                                     __end_gate_brl_fsys_bubble_down_patchlist = .;
22485 +
22486 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22487 +                                   __start_gate_running_on_xen_patchlist = .;
22488 +                                   *(.data.patch.running_on_xen)
22489 +                                   __end_gate_running_on_xen_patchlist = .;
22490 +
22491 +                                   __start_gate_brl_xen_rsm_be_i_patchlist = .;
22492 +                                   *(.data.patch.brl_xen_rsm_be_i)
22493 +                                   __end_gate_brl_xen_rsm_be_i_patchlist = .;
22494 +
22495 +                                   __start_gate_brl_xen_get_psr_patchlist = .;
22496 +                                   *(.data.patch.brl_xen_get_psr)
22497 +                                   __end_gate_brl_xen_get_psr_patchlist = .;
22498 +
22499 +                                   __start_gate_brl_xen_ssm_i_0_patchlist = .;
22500 +                                   *(.data.patch.brl_xen_ssm_i_0)
22501 +                                   __end_gate_brl_xen_ssm_i_0_patchlist = .;
22502 +
22503 +                                   __start_gate_brl_xen_ssm_i_1_patchlist = .;
22504 +                                   *(.data.patch.brl_xen_ssm_i_1)
22505 +                                   __end_gate_brl_xen_ssm_i_1_patchlist = .;
22506 +#endif
22507    }                                                                    :readable
22508    .IA_64.unwind_info           : { *(.IA_64.unwind_info*) }
22509    .IA_64.unwind                        : { *(.IA_64.unwind*) }                 :readable :unwind
22510 diff -ruNp linux-2.6.19/arch/ia64/kernel/head.S linux-2.6.19-xen-3.0.4/arch/ia64/kernel/head.S
22511 --- linux-2.6.19/arch/ia64/kernel/head.S        2006-11-29 21:57:37.000000000 +0000
22512 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/head.S      2007-02-02 19:10:21.000000000 +0000
22513 @@ -367,6 +367,12 @@ start_ap:
22514         ;;
22515  (isBP) st8 [r2]=r28            // save the address of the boot param area passed by the bootloader
22516  
22517 +#ifdef CONFIG_XEN
22518 +       //  Note: isBP is used by the subprogram.
22519 +       br.call.sptk.many rp=early_xen_setup
22520 +       ;;
22521 +#endif
22522 +
22523  #ifdef CONFIG_SMP
22524  (isAP) br.call.sptk.many rp=start_secondary
22525  .ret0:
22526 diff -ruNp linux-2.6.19/arch/ia64/kernel/iosapic.c linux-2.6.19-xen-3.0.4/arch/ia64/kernel/iosapic.c
22527 --- linux-2.6.19/arch/ia64/kernel/iosapic.c     2006-11-29 21:57:37.000000000 +0000
22528 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/iosapic.c   2007-02-02 19:10:21.000000000 +0000
22529 @@ -159,6 +159,75 @@ static unsigned char pcat_compat __devin
22530  static int iosapic_kmalloc_ok;
22531  static LIST_HEAD(free_rte_list);
22532  
22533 +#ifdef CONFIG_XEN
22534 +#include <xen/interface/xen.h>
22535 +#include <xen/interface/physdev.h>
22536 +#include <asm/hypervisor.h>
22537 +static inline unsigned int xen_iosapic_read(char __iomem *iosapic, unsigned int reg)
22538 +{
22539 +       struct physdev_apic apic_op;
22540 +       int ret;
22541 +
22542 +       apic_op.apic_physbase = (unsigned long)iosapic -
22543 +                                       __IA64_UNCACHED_OFFSET;
22544 +       apic_op.reg = reg;
22545 +       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
22546 +       if (ret)
22547 +               return ret;
22548 +       return apic_op.value;
22549 +}
22550 +
22551 +static inline void xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
22552 +{
22553 +       struct physdev_apic apic_op;
22554 +
22555 +       apic_op.apic_physbase = (unsigned long)iosapic - 
22556 +                                       __IA64_UNCACHED_OFFSET;
22557 +       apic_op.reg = reg;
22558 +       apic_op.value = val;
22559 +       HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
22560 +}
22561 +
22562 +static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
22563 +{
22564 +       if (!is_running_on_xen()) {
22565 +               writel(reg, iosapic + IOSAPIC_REG_SELECT);
22566 +               return readl(iosapic + IOSAPIC_WINDOW);
22567 +       } else
22568 +               return xen_iosapic_read(iosapic, reg);
22569 +}
22570 +
22571 +static inline void iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
22572 +{
22573 +       if (!is_running_on_xen()) {
22574 +               writel(reg, iosapic + IOSAPIC_REG_SELECT);
22575 +               writel(val, iosapic + IOSAPIC_WINDOW);
22576 +       } else
22577 +               xen_iosapic_write(iosapic, reg, val);
22578 +}
22579 +
22580 +int xen_assign_irq_vector(int irq)
22581 +{
22582 +       struct physdev_irq irq_op;
22583 +
22584 +       irq_op.irq = irq;
22585 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
22586 +               return -ENOSPC;
22587 +
22588 +       return irq_op.vector;
22589 +}
22590 +
22591 +void xen_free_irq_vector(int vector)
22592 +{
22593 +       struct physdev_irq irq_op;
22594 +
22595 +       irq_op.vector = vector;
22596 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_free_irq_vector, &irq_op))
22597 +               printk(KERN_WARNING "%s: xen_free_irq_vector failed, vector=%d\n",
22598 +                      __FUNCTION__, vector);
22599 +}
22600 +#endif /* XEN */
22601 +
22602  /*
22603   * Find an IOSAPIC associated with a GSI
22604   */
22605 @@ -653,6 +722,9 @@ register_intr (unsigned int gsi, int vec
22606         iosapic_intr_info[vector].dmode    = delivery;
22607         iosapic_intr_info[vector].trigger  = trigger;
22608  
22609 +       if (is_running_on_xen())
22610 +               return 0;
22611 +
22612         if (trigger == IOSAPIC_EDGE)
22613                 irq_type = &irq_type_iosapic_edge;
22614         else
22615 @@ -1015,6 +1087,9 @@ iosapic_system_init (int system_pcat_com
22616         }
22617  
22618         pcat_compat = system_pcat_compat;
22619 +       if (is_running_on_xen())
22620 +               return;
22621 +
22622         if (pcat_compat) {
22623                 /*
22624                  * Disable the compatibility mode interrupts (8259 style),
22625 diff -ruNp linux-2.6.19/arch/ia64/kernel/irq_ia64.c linux-2.6.19-xen-3.0.4/arch/ia64/kernel/irq_ia64.c
22626 --- linux-2.6.19/arch/ia64/kernel/irq_ia64.c    2006-11-29 21:57:37.000000000 +0000
22627 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/irq_ia64.c  2007-02-02 19:10:21.000000000 +0000
22628 @@ -31,6 +31,10 @@
22629  #include <linux/threads.h>
22630  #include <linux/bitops.h>
22631  #include <linux/irq.h>
22632 +#ifdef CONFIG_XEN
22633 +#include <linux/cpu.h>
22634 +#endif
22635 +
22636  
22637  #include <asm/delay.h>
22638  #include <asm/intrinsics.h>
22639 @@ -70,6 +74,13 @@ int
22640  assign_irq_vector (int irq)
22641  {
22642         int pos, vector;
22643 +
22644 +#ifdef CONFIG_XEN
22645 +       if (is_running_on_xen()) {
22646 +               extern int xen_assign_irq_vector(int);
22647 +               return xen_assign_irq_vector(irq);
22648 +       }
22649 +#endif
22650   again:
22651         pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
22652         vector = IA64_FIRST_DEVICE_VECTOR + pos;
22653 @@ -88,6 +99,13 @@ free_irq_vector (int vector)
22654         if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR)
22655                 return;
22656  
22657 +#ifdef CONFIG_XEN
22658 +       if (is_running_on_xen()) {
22659 +               extern void xen_free_irq_vector(int);
22660 +               xen_free_irq_vector(vector);
22661 +               return;
22662 +       }
22663 +#endif
22664         pos = vector - IA64_FIRST_DEVICE_VECTOR;
22665         if (!test_and_clear_bit(pos, ia64_vector_mask))
22666                 printk(KERN_WARNING "%s: double free!\n", __FUNCTION__);
22667 @@ -280,14 +298,270 @@ static struct irqaction resched_irqactio
22668  };
22669  #endif
22670  
22671 +#ifdef CONFIG_XEN
22672 +#include <xen/evtchn.h>
22673 +#include <xen/interface/callback.h>
22674 +
22675 +static DEFINE_PER_CPU(int, timer_irq) = -1;
22676 +static DEFINE_PER_CPU(int, ipi_irq) = -1;
22677 +static DEFINE_PER_CPU(int, resched_irq) = -1;
22678 +static DEFINE_PER_CPU(int, cmc_irq) = -1;
22679 +static DEFINE_PER_CPU(int, cmcp_irq) = -1;
22680 +static DEFINE_PER_CPU(int, cpep_irq) = -1;
22681 +static char timer_name[NR_CPUS][15];
22682 +static char ipi_name[NR_CPUS][15];
22683 +static char resched_name[NR_CPUS][15];
22684 +static char cmc_name[NR_CPUS][15];
22685 +static char cmcp_name[NR_CPUS][15];
22686 +static char cpep_name[NR_CPUS][15];
22687 +
22688 +struct saved_irq {
22689 +       unsigned int irq;
22690 +       struct irqaction *action;
22691 +};
22692 +/* 16 should be a fairly optimistic value, since only a few percpu irqs
22693 + * are registered early.
22694 + */
22695 +#define MAX_LATE_IRQ   16
22696 +static struct saved_irq saved_percpu_irqs[MAX_LATE_IRQ];
22697 +static unsigned short late_irq_cnt = 0;
22698 +static unsigned short saved_irq_cnt = 0;
22699 +static int xen_slab_ready = 0;
22700 +
22701 +#ifdef CONFIG_SMP
22702 +/* Dummy stub. Though we may check RESCHEDULE_VECTOR before __do_IRQ,
22703 + * it ends up issuing several memory accesses on percpu data and
22704 + * thus adds unnecessary traffic to other paths.
22705 + */
22706 +static irqreturn_t
22707 +handle_reschedule(int irq, void *dev_id, struct pt_regs *regs)
22708 +{
22709 +
22710 +       return IRQ_HANDLED;
22711 +}
22712 +
22713 +static struct irqaction resched_irqaction = {
22714 +       .handler =      handle_reschedule,
22715 +       .flags =        SA_INTERRUPT,
22716 +       .name =         "RESCHED"
22717 +};
22718 +#endif
22719 +
22720 +/*
22721 + * This is the xen version of percpu irq registration, which needs to bind
22722 + * to the xen-specific evtchn sub-system. One trick here is that the xen
22723 + * evtchn binding interface depends on kmalloc, because the related
22724 + * port needs to be freed at device/cpu down. So we cache the
22725 + * registrations on the BSP before slab is ready and then handle them
22726 + * at a later point. Instances registered after slab is ready are
22727 + * hooked to xen evtchn immediately.
22728 + *
22729 + * FIXME: MCA is not supported so far, and thus the "nomca" boot param is
22730 + * required.
22731 + */
22732 +static void
22733 +xen_register_percpu_irq (unsigned int irq, struct irqaction *action, int save)
22734 +{
22735 +       unsigned int cpu = smp_processor_id();
22736 +       int ret = 0;
22737 +
22738 +       if (xen_slab_ready) {
22739 +               switch (irq) {
22740 +               case IA64_TIMER_VECTOR:
22741 +                       sprintf(timer_name[cpu], "%s%d", action->name, cpu);
22742 +                       ret = bind_virq_to_irqhandler(VIRQ_ITC, cpu,
22743 +                               action->handler, action->flags,
22744 +                               timer_name[cpu], action->dev_id);
22745 +                       per_cpu(timer_irq,cpu) = ret;
22746 +                       printk(KERN_INFO "register VIRQ_ITC (%s) to xen irq (%d)\n", timer_name[cpu], ret);
22747 +                       break;
22748 +               case IA64_IPI_RESCHEDULE:
22749 +                       sprintf(resched_name[cpu], "%s%d", action->name, cpu);
22750 +                       ret = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, cpu,
22751 +                               action->handler, action->flags,
22752 +                               resched_name[cpu], action->dev_id);
22753 +                       per_cpu(resched_irq,cpu) = ret;
22754 +                       printk(KERN_INFO "register RESCHEDULE_VECTOR (%s) to xen irq (%d)\n", resched_name[cpu], ret);
22755 +                       break;
22756 +               case IA64_IPI_VECTOR:
22757 +                       sprintf(ipi_name[cpu], "%s%d", action->name, cpu);
22758 +                       ret = bind_ipi_to_irqhandler(IPI_VECTOR, cpu,
22759 +                               action->handler, action->flags,
22760 +                               ipi_name[cpu], action->dev_id);
22761 +                       per_cpu(ipi_irq,cpu) = ret;
22762 +                       printk(KERN_INFO "register IPI_VECTOR (%s) to xen irq (%d)\n", ipi_name[cpu], ret);
22763 +                       break;
22764 +               case IA64_SPURIOUS_INT_VECTOR:
22765 +                       break;
22766 +               case IA64_CMC_VECTOR:
22767 +                       sprintf(cmc_name[cpu], "%s%d", action->name, cpu);
22768 +                       ret = bind_virq_to_irqhandler(VIRQ_MCA_CMC, cpu,
22769 +                                                     action->handler,
22770 +                                                     action->flags,
22771 +                                                     cmc_name[cpu],
22772 +                                                     action->dev_id);
22773 +                       per_cpu(cmc_irq,cpu) = ret;
22774 +                       printk(KERN_INFO "register VIRQ_MCA_CMC (%s) to xen "
22775 +                              "irq (%d)\n", cmc_name[cpu], ret);
22776 +                       break;
22777 +               case IA64_CMCP_VECTOR:
22778 +                       sprintf(cmcp_name[cpu], "%s%d", action->name, cpu);
22779 +                       ret = bind_ipi_to_irqhandler(CMCP_VECTOR, cpu,
22780 +                                                    action->handler,
22781 +                                                    action->flags,
22782 +                                                    cmcp_name[cpu],
22783 +                                                    action->dev_id);
22784 +                       per_cpu(cmcp_irq,cpu) = ret;
22785 +                       printk(KERN_INFO "register CMCP_VECTOR (%s) to xen "
22786 +                              "irq (%d)\n", cmcp_name[cpu], ret);
22787 +                       break;
22788 +               case IA64_CPEP_VECTOR:
22789 +                       sprintf(cpep_name[cpu], "%s%d", action->name, cpu);
22790 +                       ret = bind_ipi_to_irqhandler(CPEP_VECTOR, cpu,
22791 +                                                    action->handler,
22792 +                                                    action->flags,
22793 +                                                    cpep_name[cpu],
22794 +                                                    action->dev_id);
22795 +                       per_cpu(cpep_irq,cpu) = ret;
22796 +                       printk(KERN_INFO "register CPEP_VECTOR (%s) to xen "
22797 +                              "irq (%d)\n", cpep_name[cpu], ret);
22798 +                       break;
22799 +               case IA64_CPE_VECTOR:
22800 +                       printk(KERN_WARNING "register IA64_CPE_VECTOR "
22801 +                              "IGNORED\n");
22802 +                       break;
22803 +               default:
22804 +                       printk(KERN_WARNING "Percpu irq %d is unsupported by xen!\n", irq);
22805 +                       break;
22806 +               }
22807 +               BUG_ON(ret < 0);
22808 +       } 
22809 +
22810 +       /* For the BSP, we cache registered percpu irqs, and then re-walk
22811 +        * them when initializing the APs.
22812 +        */
22813 +       if (!cpu && save) {
22814 +               BUG_ON(saved_irq_cnt == MAX_LATE_IRQ);
22815 +               saved_percpu_irqs[saved_irq_cnt].irq = irq;
22816 +               saved_percpu_irqs[saved_irq_cnt].action = action;
22817 +               saved_irq_cnt++;
22818 +               if (!xen_slab_ready)
22819 +                       late_irq_cnt++;
22820 +       }
22821 +}
22822 +
22823 +static void
22824 +xen_bind_early_percpu_irq (void)
22825 +{
22826 +       int i;
22827 +
22828 +       xen_slab_ready = 1;
22829 +       /* There's no race when accessing this cached array, since only
22830 +        * the BSP will take this path, and only shortly after boot.
22831 +        */
22832 +       for (i = 0; i < late_irq_cnt; i++)
22833 +               xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22834 +                                       saved_percpu_irqs[i].action, 0);
22835 +}
22836 +
22837 +/* FIXME: There's no obvious point at which to check whether slab is ready,
22838 + * so a hack is used here that utilizes the late_time_init hook.
22839 + */
22840 +extern void (*late_time_init)(void);
22841 +extern char xen_event_callback;
22842 +extern void xen_init_IRQ(void);
22843 +
22844 +#ifdef CONFIG_HOTPLUG_CPU
22845 +static int __devinit
22846 +unbind_evtchn_callback(struct notifier_block *nfb,
22847 +                       unsigned long action, void *hcpu)
22848 +{
22849 +       unsigned int cpu = (unsigned long)hcpu;
22850 +
22851 +       if (action == CPU_DEAD) {
22852 +               /* Unregister evtchn.  */
22853 +               if (per_cpu(cpep_irq,cpu) >= 0) {
22854 +                       unbind_from_irqhandler(per_cpu(cpep_irq, cpu), NULL);
22855 +                       per_cpu(cpep_irq, cpu) = -1;
22856 +               }
22857 +               if (per_cpu(cmcp_irq,cpu) >= 0) {
22858 +                       unbind_from_irqhandler(per_cpu(cmcp_irq, cpu), NULL);
22859 +                       per_cpu(cmcp_irq, cpu) = -1;
22860 +               }
22861 +               if (per_cpu(cmc_irq,cpu) >= 0) {
22862 +                       unbind_from_irqhandler(per_cpu(cmc_irq, cpu), NULL);
22863 +                       per_cpu(cmc_irq, cpu) = -1;
22864 +               }
22865 +               if (per_cpu(ipi_irq,cpu) >= 0) {
22866 +                       unbind_from_irqhandler (per_cpu(ipi_irq, cpu), NULL);
22867 +                       per_cpu(ipi_irq, cpu) = -1;
22868 +               }
22869 +               if (per_cpu(resched_irq,cpu) >= 0) {
22870 +                       unbind_from_irqhandler (per_cpu(resched_irq, cpu),
22871 +                                               NULL);
22872 +                       per_cpu(resched_irq, cpu) = -1;
22873 +               }
22874 +               if (per_cpu(timer_irq,cpu) >= 0) {
22875 +                       unbind_from_irqhandler (per_cpu(timer_irq, cpu), NULL);
22876 +                       per_cpu(timer_irq, cpu) = -1;
22877 +               }
22878 +       }
22879 +       return NOTIFY_OK;
22880 +}
22881 +
22882 +static struct notifier_block unbind_evtchn_notifier = {
22883 +       .notifier_call = unbind_evtchn_callback,
22884 +       .priority = 0
22885 +};
22886 +#endif
22887 +
22888 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
22889 +void xen_smp_intr_init(void)
22890 +{
22891 +#ifdef CONFIG_SMP
22892 +       unsigned int cpu = smp_processor_id();
22893 +       unsigned int i = 0;
22894 +       struct callback_register event = {
22895 +               .type = CALLBACKTYPE_event,
22896 +               .address = (unsigned long)&xen_event_callback,
22897 +       };
22898 +
22899 +       if (cpu == 0) {
22900 +               /* Initialization was already done for boot cpu.  */
22901 +#ifdef CONFIG_HOTPLUG_CPU
22902 +               /* Register the notifier only once.  */
22903 +               register_cpu_notifier(&unbind_evtchn_notifier);
22904 +#endif
22905 +               return;
22906 +       }
22907 +
22908 +       /* This should be piggybacked onto vcpu guest context setup */
22909 +       BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22910 +
22911 +       for (i = 0; i < saved_irq_cnt; i++)
22912 +               xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22913 +                                       saved_percpu_irqs[i].action, 0);
22914 +#endif /* CONFIG_SMP */
22915 +}
22916 +#endif /* CONFIG_XEN */
22917 +
22918  void
22919  register_percpu_irq (ia64_vector vec, struct irqaction *action)
22920  {
22921         irq_desc_t *desc;
22922         unsigned int irq;
22923  
22924 +#ifdef CONFIG_XEN
22925 +       if (is_running_on_xen())
22926 +               return xen_register_percpu_irq(vec, action, 1);
22927 +#endif
22928 +
22929         for (irq = 0; irq < NR_IRQS; ++irq)
22930                 if (irq_to_vector(irq) == vec) {
22931 +#ifdef CONFIG_XEN
22932 +                       if (is_running_on_xen())
22933 +                               return xen_register_percpu_irq(vec, action, 1);
22934 +#endif
22935                         desc = irq_desc + irq;
22936                         desc->status |= IRQ_PER_CPU;
22937                         desc->chip = &irq_type_ia64_lsapic;
22938 @@ -299,6 +573,21 @@ register_percpu_irq (ia64_vector vec, st
22939  void __init
22940  init_IRQ (void)
22941  {
22942 +#ifdef CONFIG_XEN
22943 +       /* Maybe put into platform_irq_init later */
22944 +       if (is_running_on_xen()) {
22945 +               struct callback_register event = {
22946 +                       .type = CALLBACKTYPE_event,
22947 +                       .address = (unsigned long)&xen_event_callback,
22948 +               };
22949 +               xen_init_IRQ();
22950 +               BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22951 +               late_time_init = xen_bind_early_percpu_irq;
22952 +#ifdef CONFIG_SMP
22953 +               register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
22954 +#endif /* CONFIG_SMP */
22955 +       }
22956 +#endif /* CONFIG_XEN */
22957         register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
22958  #ifdef CONFIG_SMP
22959         register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
22960 @@ -317,6 +606,45 @@ ia64_send_ipi (int cpu, int vector, int 
22961         unsigned long ipi_data;
22962         unsigned long phys_cpu_id;
22963  
22964 +#ifdef CONFIG_XEN
22965 +        if (is_running_on_xen()) {
22966 +               int irq = -1;
22967 +
22968 +#ifdef CONFIG_SMP
22969 +               /* TODO: we need to call vcpu_up here */
22970 +               if (unlikely(vector == ap_wakeup_vector)) {
22971 +                       extern void xen_send_ipi (int cpu, int vec);
22972 +                       xen_send_ipi (cpu, vector);
22973 +                       //vcpu_prepare_and_up(cpu);
22974 +                       return;
22975 +               }
22976 +#endif
22977 +
22978 +               switch(vector) {
22979 +               case IA64_IPI_VECTOR:
22980 +                       irq = per_cpu(ipi_to_irq, cpu)[IPI_VECTOR];
22981 +                       break;
22982 +               case IA64_IPI_RESCHEDULE:
22983 +                       irq = per_cpu(ipi_to_irq, cpu)[RESCHEDULE_VECTOR];
22984 +                       break;
22985 +               case IA64_CMCP_VECTOR:
22986 +                       irq = per_cpu(ipi_to_irq, cpu)[CMCP_VECTOR];
22987 +                       break;
22988 +               case IA64_CPEP_VECTOR:
22989 +                       irq = per_cpu(ipi_to_irq, cpu)[CPEP_VECTOR];
22990 +                       break;
22991 +               default:
22992 +                       printk(KERN_WARNING "Unsupported IPI type 0x%x\n", vector);
22993 +                       irq = 0;
22994 +                       break;
22995 +               }               
22996 +       
22997 +               BUG_ON(irq < 0);
22998 +               notify_remote_via_irq(irq);
22999 +               return;
23000 +        }
23001 +#endif /* CONFIG_XEN */
23002 +
23003  #ifdef CONFIG_SMP
23004         phys_cpu_id = cpu_physical_id(cpu);
23005  #else
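
The early-registration handling in irq_ia64.c above relies on one trick: while the evtchn binding code still needs kmalloc (before slab is up), percpu irq registrations on the boot CPU are only recorded, and the recorded list is replayed once the late_time_init hook runs; later registrations bind immediately. Stripped of the Xen specifics, the pattern looks roughly like the sketch below. Every name in it is illustrative, and the real code additionally keeps a copy of each registration so it can be replayed on every AP:

#include <stdio.h>

#define MAX_EARLY_IRQS 16

struct saved_reg {
	unsigned int irq;
	void (*bind)(unsigned int irq);	/* stands in for bind_*_to_irqhandler() */
};

static struct saved_reg pending[MAX_EARLY_IRQS];
static unsigned int npending;
static int backend_ready;		/* stands in for xen_slab_ready */

static void demo_bind(unsigned int irq)
{
	printf("bound irq %u\n", irq);
}

static void register_percpu(unsigned int irq, void (*bind)(unsigned int))
{
	if (backend_ready) {
		bind(irq);		/* allocator usable: bind right away */
		return;
	}
	/* too early: remember the request so it can be replayed later */
	pending[npending].irq = irq;
	pending[npending].bind = bind;
	npending++;
}

static void replay_pending(void)	/* stands in for xen_bind_early_percpu_irq() */
{
	unsigned int i;

	backend_ready = 1;
	for (i = 0; i < npending; i++)
		pending[i].bind(pending[i].irq);
}

int main(void)
{
	register_percpu(240, demo_bind);	/* cached, nothing bound yet */
	register_percpu(253, demo_bind);
	replay_pending();			/* both bindings happen here */
	register_percpu(254, demo_bind);	/* binds immediately now */
	return 0;
}
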
23006 diff -ruNp linux-2.6.19/arch/ia64/kernel/pal.S linux-2.6.19-xen-3.0.4/arch/ia64/kernel/pal.S
23007 --- linux-2.6.19/arch/ia64/kernel/pal.S 2006-11-29 21:57:37.000000000 +0000
23008 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/pal.S       2007-02-02 19:10:21.000000000 +0000
23009 @@ -16,6 +16,7 @@
23010  #include <asm/processor.h>
23011  
23012         .data
23013 +       .globl pal_entry_point
23014  pal_entry_point:
23015         data8 ia64_pal_default_handler
23016         .text
23017 @@ -86,7 +87,7 @@ GLOBAL_ENTRY(ia64_pal_call_static)
23018         ;;
23019         srlz.d                          // seralize restoration of psr.l
23020         br.ret.sptk.many b0
23021 -END(ia64_pal_call_static)
23022 +END(__ia64_pal_call_static)
23023  
23024  /*
23025   * Make a PAL call using the stacked registers calling convention.
23026 diff -ruNp linux-2.6.19/arch/ia64/kernel/patch.c linux-2.6.19-xen-3.0.4/arch/ia64/kernel/patch.c
23027 --- linux-2.6.19/arch/ia64/kernel/patch.c       2006-11-29 21:57:37.000000000 +0000
23028 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/patch.c     2007-02-02 19:10:21.000000000 +0000
23029 @@ -184,6 +184,73 @@ patch_brl_fsys_bubble_down (unsigned lon
23030         ia64_srlz_i();
23031  }
23032  
23033 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
23034 +extern char __start_gate_running_on_xen_patchlist[];
23035 +extern char __end_gate_running_on_xen_patchlist[];
23036 +
23037 +void __init
23038 +patch_running_on_xen(unsigned long start, unsigned long end)
23039 +{
23040 +       extern int running_on_xen;
23041 +       s32 *offp = (s32 *)start;
23042 +       u64 ip;
23043 +
23044 +       while (offp < (s32 *)end) {
23045 +               ip = (u64)ia64_imva((char *)offp + *offp);
23046 +               ia64_patch_imm64(ip, (u64)&running_on_xen);
23047 +               ia64_fc((void *)ip);
23048 +               ++offp;
23049 +       }
23050 +       ia64_sync_i();
23051 +       ia64_srlz_i();
23052 +}
23053 +
23054 +static void __init
23055 +patch_brl_symaddr(unsigned long start, unsigned long end,
23056 +                  unsigned long symaddr)
23057 +{
23058 +       s32 *offp = (s32 *)start;
23059 +       u64 ip;
23060 +
23061 +       while (offp < (s32 *)end) {
23062 +               ip = (u64)offp + *offp;
23063 +               ia64_patch_imm60((u64)ia64_imva((void *)ip),
23064 +                                (u64)(symaddr - (ip & -16)) / 16);
23065 +               ia64_fc((void *)ip);
23066 +               ++offp;
23067 +       }
23068 +       ia64_sync_i();
23069 +       ia64_srlz_i();
23070 +}
23071 +
23072 +#define EXTERN_PATCHLIST(name)                                 \
23073 +       extern char __start_gate_brl_##name##_patchlist[];      \
23074 +       extern char __end_gate_brl_##name##_patchlist[];        \
23075 +       extern char name[]
23076 +
23077 +#define PATCH_BRL_SYMADDR(name)                                                \
23078 +       patch_brl_symaddr((unsigned long)__start_gate_brl_##name##_patchlist, \
23079 +                         (unsigned long)__end_gate_brl_##name##_patchlist,   \
23080 +                         (unsigned long)name)
23081 +
23082 +static void __init
23083 +patch_brl_in_vdso(void)
23084 +{
23085 +       EXTERN_PATCHLIST(xen_rsm_be_i);
23086 +       EXTERN_PATCHLIST(xen_get_psr);
23087 +       EXTERN_PATCHLIST(xen_ssm_i_0);
23088 +       EXTERN_PATCHLIST(xen_ssm_i_1);
23089 +
23090 +       PATCH_BRL_SYMADDR(xen_rsm_be_i);
23091 +       PATCH_BRL_SYMADDR(xen_get_psr);
23092 +       PATCH_BRL_SYMADDR(xen_ssm_i_0);
23093 +       PATCH_BRL_SYMADDR(xen_ssm_i_1);
23094 +}
23095 +#else
23096 +#define patch_running_on_xen(start, end)       do { } while (0)
23097 +#define patch_brl_in_vdso()                    do { } while (0)
23098 +#endif
23099 +
23100  void __init
23101  ia64_patch_gate (void)
23102  {
23103 @@ -192,6 +259,10 @@ ia64_patch_gate (void)
23104  
23105         patch_fsyscall_table(START(fsyscall), END(fsyscall));
23106         patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
23107 +#ifdef CONFIG_XEN
23108 +       patch_running_on_xen(START(running_on_xen), END(running_on_xen));
23109 +       patch_brl_in_vdso();
23110 +#endif
23111         ia64_patch_vtop(START(vtop), END(vtop));
23112         ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
23113  }
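
A note on the patch_brl_symaddr() arithmetic above: the brl immediate patched by ia64_patch_imm60() is IP-relative and counted in 16-byte bundles, so the code aligns the patch site down to its bundle boundary (ip & -16) and divides the distance to the target symbol by 16. A toy illustration of that calculation with made-up addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical addresses, for illustration only */
	uint64_t ip      = 0xa000000000010123ULL;	/* patch site */
	uint64_t symaddr = 0xa000000000200000ULL;	/* branch target */

	/* align the slot down to its 16-byte bundle, then count bundles */
	int64_t imm60 = (int64_t)(symaddr - (ip & ~0xfULL)) / 16;

	printf("imm60 = %lld bundles\n", (long long)imm60);
	return 0;
}
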
23114 diff -ruNp linux-2.6.19/arch/ia64/kernel/perfmon.c linux-2.6.19-xen-3.0.4/arch/ia64/kernel/perfmon.c
23115 --- linux-2.6.19/arch/ia64/kernel/perfmon.c     2006-11-29 21:57:37.000000000 +0000
23116 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/perfmon.c   2007-02-02 19:10:21.000000000 +0000
23117 @@ -53,6 +53,28 @@
23118  #include <asm/delay.h>
23119  
23120  #ifdef CONFIG_PERFMON
23121 +#ifdef CONFIG_XEN
23122 +//#include <xen/xenoprof.h>
23123 +#include <xen/interface/xenoprof.h>
23124 +
23125 +static int xenoprof_is_primary = 0;
23126 +#define init_xenoprof_primary(is_primary)  (xenoprof_is_primary = (is_primary))
23127 +#define is_xenoprof_primary()  (xenoprof_is_primary)
23128 +#define XEN_NOT_SUPPORTED_YET                                          \
23129 +       do {                                                            \
23130 +               if (is_running_on_xen()) {                              \
23131 +                       printk("%s is not supported yet under xen.\n",  \
23132 +                              __func__);                               \
23133 +                       return -ENOSYS;                                 \
23134 +               }                                                       \
23135 +       } while (0)
23136 +#else
23137 +#define init_xenoprof_primary(is_primary)      do { } while (0)
23138 +#define is_xenoprof_primary()                  (0)
23139 +#define XEN_NOT_SUPPORTED_YET                  do { } while (0)
23140 +#define HYPERVISOR_perfmon_op(cmd, arg, count) do { } while (0)
23141 +#endif
23142 +
23143  /*
23144   * perfmon context state
23145   */
23146 @@ -1517,6 +1539,7 @@ pfm_read(struct file *filp, char __user 
23147         ssize_t ret;
23148         unsigned long flags;
23149         DECLARE_WAITQUEUE(wait, current);
23150 +       XEN_NOT_SUPPORTED_YET;
23151         if (PFM_IS_FILE(filp) == 0) {
23152                 printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
23153                 return -EINVAL;
23154 @@ -2115,6 +2138,15 @@ doit:
23155          */
23156         if (free_possible) pfm_context_free(ctx);
23157  
23158 +       if (is_running_on_xen()) {
23159 +               if (is_xenoprof_primary()) {
23160 +                       int ret = HYPERVISOR_perfmon_op(PFM_DESTROY_CONTEXT,
23161 +                                                       NULL, 0);
23162 +                       if (ret)
23163 +                               printk("%s:%d PFM_DESTROY_CONTEXT hypercall "
23164 +                                      "failed\n", __func__, __LINE__);
23165 +               }
23166 +       }
23167         return 0;
23168  }
23169  
23170 @@ -2738,6 +2770,23 @@ pfm_context_create(pfm_context_t *ctx, v
23171          */
23172         pfm_reset_pmu_state(ctx);
23173  
23174 +       if (is_running_on_xen()) {
23175 +               /*
23176 +                * Kludge to get xenoprof.is_primary.
23177 +                * XENOPROF_init on ia64 is a nop, so it is safe to call it here.
23178 +                */
23179 +               struct xenoprof_init init;
23180 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
23181 +               if (ret)
23182 +                       goto buffer_error;
23183 +               init_xenoprof_primary(init.is_primary);
23184 +
23185 +               if (is_xenoprof_primary()) {
23186 +                       ret = HYPERVISOR_perfmon_op(PFM_CREATE_CONTEXT, arg, 0);
23187 +                       if (ret)
23188 +                               goto buffer_error;
23189 +               }
23190 +       }
23191         return 0;
23192  
23193  buffer_error:
23194 @@ -2873,6 +2922,12 @@ pfm_write_pmcs(pfm_context_t *ctx, void 
23195         pfm_reg_check_t wr_func;
23196  #define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
23197  
23198 +       if (is_running_on_xen()) {
23199 +               if (is_xenoprof_primary())
23200 +                       return HYPERVISOR_perfmon_op(PFM_WRITE_PMCS,
23201 +                                                    arg, count);
23202 +               return 0;
23203 +       }
23204         state     = ctx->ctx_state;
23205         is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
23206         is_system = ctx->ctx_fl_system;
23207 @@ -3111,6 +3166,12 @@ pfm_write_pmds(pfm_context_t *ctx, void 
23208         int ret = -EINVAL;
23209         pfm_reg_check_t wr_func;
23210  
23211 +       if (is_running_on_xen()) {
23212 +               if (is_xenoprof_primary())
23213 +                       return HYPERVISOR_perfmon_op(PFM_WRITE_PMDS,
23214 +                                                    arg, count);
23215 +               return 0;
23216 +       }
23217  
23218         state     = ctx->ctx_state;
23219         is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
23220 @@ -3306,6 +3367,7 @@ pfm_read_pmds(pfm_context_t *ctx, void *
23221         int is_loaded, is_system, is_counting, expert_mode;
23222         int ret = -EINVAL;
23223         pfm_reg_check_t rd_func;
23224 +       XEN_NOT_SUPPORTED_YET;
23225  
23226         /*
23227          * access is possible when loaded only for
23228 @@ -3556,6 +3618,7 @@ pfm_restart(pfm_context_t *ctx, void *ar
23229         pfm_ovfl_ctrl_t rst_ctrl;
23230         int state, is_system;
23231         int ret = 0;
23232 +       XEN_NOT_SUPPORTED_YET;
23233  
23234         state     = ctx->ctx_state;
23235         fmt       = ctx->ctx_buf_fmt;
23236 @@ -3705,6 +3768,7 @@ static int
23237  pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
23238  {
23239         unsigned int m = *(unsigned int *)arg;
23240 +       XEN_NOT_SUPPORTED_YET;
23241  
23242         pfm_sysctl.debug = m == 0 ? 0 : 1;
23243  
23244 @@ -3975,6 +4039,8 @@ pfm_get_features(pfm_context_t *ctx, voi
23245  {
23246         pfarg_features_t *req = (pfarg_features_t *)arg;
23247  
23248 +       if (is_running_on_xen())
23249 +               return HYPERVISOR_perfmon_op(PFM_GET_FEATURES, &arg, 0);
23250         req->ft_version = PFM_VERSION;
23251         return 0;
23252  }
23253 @@ -3986,6 +4052,12 @@ pfm_stop(pfm_context_t *ctx, void *arg, 
23254         struct task_struct *task = PFM_CTX_TASK(ctx);
23255         int state, is_system;
23256  
23257 +       if (is_running_on_xen()) {
23258 +               if (is_xenoprof_primary())
23259 +                       return HYPERVISOR_perfmon_op(PFM_STOP, NULL, 0);
23260 +               return 0;
23261 +       }
23262 +
23263         state     = ctx->ctx_state;
23264         is_system = ctx->ctx_fl_system;
23265  
23266 @@ -4074,6 +4146,11 @@ pfm_start(pfm_context_t *ctx, void *arg,
23267         struct pt_regs *tregs;
23268         int state, is_system;
23269  
23270 +       if (is_running_on_xen()) {
23271 +               if (is_xenoprof_primary())
23272 +                       return HYPERVISOR_perfmon_op(PFM_START, NULL, 0);
23273 +               return 0;
23274 +       }
23275         state     = ctx->ctx_state;
23276         is_system = ctx->ctx_fl_system;
23277  
23278 @@ -4156,6 +4233,7 @@ pfm_get_pmc_reset(pfm_context_t *ctx, vo
23279         unsigned int cnum;
23280         int i;
23281         int ret = -EINVAL;
23282 +       XEN_NOT_SUPPORTED_YET;
23283  
23284         for (i = 0; i < count; i++, req++) {
23285  
23286 @@ -4214,6 +4292,11 @@ pfm_context_load(pfm_context_t *ctx, voi
23287         int ret = 0;
23288         int state, is_system, set_dbregs = 0;
23289  
23290 +       if (is_running_on_xen()) {
23291 +               if (is_xenoprof_primary())
23292 +                       return HYPERVISOR_perfmon_op(PFM_LOAD_CONTEXT, arg, 0);
23293 +               return 0;
23294 +       }
23295         state     = ctx->ctx_state;
23296         is_system = ctx->ctx_fl_system;
23297         /*
23298 @@ -4462,6 +4545,12 @@ pfm_context_unload(pfm_context_t *ctx, v
23299         int prev_state, is_system;
23300         int ret;
23301  
23302 +       if (is_running_on_xen()) {
23303 +               if (is_xenoprof_primary())
23304 +                       return HYPERVISOR_perfmon_op(PFM_UNLOAD_CONTEXT,
23305 +                                                    NULL, 0);
23306 +               return 0;
23307 +       }
23308         DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
23309  
23310         prev_state = ctx->ctx_state;
23311 diff -ruNp linux-2.6.19/arch/ia64/kernel/setup.c linux-2.6.19-xen-3.0.4/arch/ia64/kernel/setup.c
23312 --- linux-2.6.19/arch/ia64/kernel/setup.c       2006-11-29 21:57:37.000000000 +0000
23313 +++ linux-2.6.19-xen-3.0.4/arch/ia64/kernel/setup.c     2007-02-02 19:10:21.000000000 +0000
23314 @@ -59,6 +59,11 @@
23315  #include <asm/system.h>
23316  #include <asm/unistd.h>
23317  #include <asm/system.h>
23318 +#ifdef CONFIG_XEN
23319 +#include <asm/hypervisor.h>
23320 +#include <asm/xen/xencomm.h>
23321 +#endif
23322 +#include <linux/dma-mapping.h>
23323  
23324  #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
23325  # error "struct cpuinfo_ia64 too big!"
23326 @@ -69,6 +74,22 @@ unsigned long __per_cpu_offset[NR_CPUS];
23327  EXPORT_SYMBOL(__per_cpu_offset);
23328  #endif
23329  
23330 +#ifdef CONFIG_XEN
23331 +static int
23332 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
23333 +{
23334 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
23335 +       /* we're never actually going to get here... */
23336 +       return NOTIFY_DONE;
23337 +}
23338 +
23339 +static struct notifier_block xen_panic_block = {
23340 +       .notifier_call  = xen_panic_event,
23341 +       .next           = NULL,
23342 +       .priority       = 0     /* try to go last */
23343 +};
23344 +#endif
23345 +
23346  extern void ia64_setup_printk_clock(void);
23347  
23348  DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
23349 @@ -175,15 +196,33 @@ filter_rsvd_memory (unsigned long start,
23350         return 0;
23351  }
23352  
23353 +static int __init
23354 +rsvd_region_cmp(struct rsvd_region *lhs, struct rsvd_region *rhs)
23355 +{
23356 +       if (lhs->start > rhs->start)
23357 +               return 1;
23358 +       if (lhs->start < rhs->start)
23359 +               return -1;
23360 +
23361 +       if (lhs->end > rhs->end)
23362 +               return 1;
23363 +       if (lhs->end < rhs->end)
23364 +               return -1;
23365 +
23366 +       return 0;
23367 +}
23368 +
23369  static void __init
23370  sort_regions (struct rsvd_region *rsvd_region, int max)
23371  {
23372 +       int num = max;
23373         int j;
23374  
23375         /* simple bubble sorting */
23376         while (max--) {
23377                 for (j = 0; j < max; ++j) {
23378 -                       if (rsvd_region[j].start > rsvd_region[j+1].start) {
23379 +                       if (rsvd_region_cmp(&rsvd_region[j],
23380 +                                           &rsvd_region[j + 1]) > 0) {
23381                                 struct rsvd_region tmp;
23382                                 tmp = rsvd_region[j];
23383                                 rsvd_region[j] = rsvd_region[j + 1];
23384 @@ -191,6 +230,36 @@ sort_regions (struct rsvd_region *rsvd_r
23385                         }
23386                 }
23387         }
23388 +
23389 +       for (j = 0; j < num - 1; j++) {
23390 +               int k;
23391 +               unsigned long start = rsvd_region[j].start;
23392 +               unsigned long end = rsvd_region[j].end;
23393 +               int collapsed;
23394 +               
23395 +               for (k = j + 1; k < num; k++) {
23396 +                       BUG_ON(start > rsvd_region[k].start);
23397 +                       if (end < rsvd_region[k].start) {
23398 +                               k--;
23399 +                               break;
23400 +                       }
23401 +                       end = max(end, rsvd_region[k].end);
23402 +               }
23403 +               if (k == num)
23404 +                       k--;
23405 +               rsvd_region[j].end = end;
23406 +               collapsed = k - j;
23407 +               num -= collapsed;
23408 +               for (k = j + 1; k < num; k++) {
23409 +                       rsvd_region[k] = rsvd_region[k + collapsed];
23410 +               }
23411 +       }
23412 +
23413 +       num_rsvd_regions = num;
23414 +       for (j = 0; j < num; j++) {
23415 +               printk("rsvd_region[%d]: [0x%016lx, 0x%016lx)\n",
23416 +                      j, rsvd_region[j].start, rsvd_region[j].end);
23417 +       }
23418  }
23419  
23420  /*
23421 @@ -241,6 +310,14 @@ reserve_memory (void)
23422         rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
23423         n++;
23424  
23425 +#ifdef CONFIG_XEN
23426 +       if (is_running_on_xen()) {
23427 +               rsvd_region[n].start = (unsigned long)__va((HYPERVISOR_shared_info->arch.start_info_pfn << PAGE_SHIFT));
23428 +               rsvd_region[n].end   = rsvd_region[n].start + PAGE_SIZE;
23429 +               n++;
23430 +       }
23431 +#endif
23432 +
23433  #ifdef CONFIG_BLK_DEV_INITRD
23434         if (ia64_boot_param->initrd_start) {
23435                 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
23436 @@ -332,6 +409,16 @@ early_console_setup (char *cmdline)
23437  {
23438         int earlycons = 0;
23439  
23440 +#ifdef CONFIG_XEN
23441 +#ifndef CONFIG_IA64_HP_SIM
23442 +       if (is_running_on_xen()) {
23443 +               extern struct console hpsim_cons;
23444 +               hpsim_cons.flags |= CON_BOOT;
23445 +               register_console(&hpsim_cons);
23446 +               earlycons++;
23447 +       }
23448 +#endif
23449 +#endif
23450  #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
23451         {
23452                 extern int sn_serial_console_early_setup(void);
23453 @@ -401,6 +488,17 @@ setup_arch (char **cmdline_p)
23454  {
23455         unw_init();
23456  
23457 +#ifdef CONFIG_XEN
23458 +       if (is_running_on_xen()) {
23459 +               /* Must be done before any hypercall.  */
23460 +               xencomm_init();
23461 +
23462 +               setup_xen_features();
23463 +               /* Register a call for panic conditions. */
23464 +               atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
23465 +       }
23466 +#endif
23467 +
23468         ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
23469  
23470         *cmdline_p = __va(ia64_boot_param->command_line);
23471 @@ -479,7 +577,26 @@ setup_arch (char **cmdline_p)
23472                         conswitchp = &vga_con;
23473  # endif
23474         }
23475 +#ifdef CONFIG_XEN
23476 +       if (is_running_on_xen()) {
23477 +               shared_info_t *s = HYPERVISOR_shared_info;
23478 +
23479 +               xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT);
23480 +
23481 +               printk("Running on Xen! start_info_pfn=0x%lx nr_pages=%ld "
23482 +                      "flags=0x%x\n", s->arch.start_info_pfn,
23483 +                      xen_start_info->nr_pages, xen_start_info->flags);
23484 +
23485 +               if (!is_initial_xendomain()) {
23486 +#if !defined(CONFIG_VT) || !defined(CONFIG_DUMMY_CONSOLE)
23487 +                       conswitchp = NULL;
23488  #endif
23489 +               }
23490 +       }
23491 +       xencons_early_setup();
23492 +#endif
23493 +#endif
23494 +
23495  
23496         /* enable IA-64 Machine Check Abort Handling unless disabled */
23497         if (!nomca)
23498 @@ -487,6 +604,9 @@ setup_arch (char **cmdline_p)
23499  
23500         platform_setup(cmdline_p);
23501         paging_init();
23502 +#ifdef CONFIG_XEN
23503 +       contiguous_bitmap_init(max_pfn);
23504 +#endif
23505  }
23506  
23507  /*
23508 @@ -892,6 +1012,15 @@ cpu_init (void)
23509         /* size of physical stacked register partition plus 8 bytes: */
23510         __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
23511         platform_cpu_init();
23512 +
23513 +#ifdef CONFIG_XEN
23514 +       /* Needs to be moved into platform_cpu_init() later */
23515 +       if (is_running_on_xen()) {
23516 +               extern void xen_smp_intr_init(void);
23517 +               xen_smp_intr_init();
23518 +       }
23519 +#endif
23520 +
23521         pm_idle = default_idle;
23522  }
23523  
23524 diff -ruNp linux-2.6.19/arch/ia64/mm/ioremap.c linux-2.6.19-xen-3.0.4/arch/ia64/mm/ioremap.c
23525 --- linux-2.6.19/arch/ia64/mm/ioremap.c 2006-11-29 21:57:37.000000000 +0000
23526 +++ linux-2.6.19-xen-3.0.4/arch/ia64/mm/ioremap.c       2007-02-02 19:10:21.000000000 +0000
23527 @@ -16,6 +16,9 @@
23528  static inline void __iomem *
23529  __ioremap (unsigned long offset, unsigned long size)
23530  {
23531 +#ifdef CONFIG_XEN
23532 +       offset = HYPERVISOR_ioremap(offset, size);
23533 +#endif
23534         return (void __iomem *) (__IA64_UNCACHED_OFFSET | offset);
23535  }
23536  
23537 diff -ruNp linux-2.6.19/arch/ia64/oprofile/Makefile linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/Makefile
23538 --- linux-2.6.19/arch/ia64/oprofile/Makefile    2006-11-29 21:57:37.000000000 +0000
23539 +++ linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/Makefile  2007-02-02 19:10:21.000000000 +0000
23540 @@ -8,3 +8,7 @@ DRIVER_OBJS := $(addprefix ../../../driv
23541  
23542  oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
23543  oprofile-$(CONFIG_PERFMON) += perfmon.o
23544 +ifeq ($(CONFIG_XEN), y)
23545 +oprofile-$(CONFIG_PERFMON) += xenoprof.o \
23546 +       ../../../drivers/xen/xenoprof/xenoprofile.o
23547 +endif
23548 diff -ruNp linux-2.6.19/arch/ia64/oprofile/init.c linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/init.c
23549 --- linux-2.6.19/arch/ia64/oprofile/init.c      2006-11-29 21:57:37.000000000 +0000
23550 +++ linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/init.c    2007-02-02 19:10:21.000000000 +0000
23551 @@ -11,6 +11,7 @@
23552  #include <linux/oprofile.h>
23553  #include <linux/init.h>
23554  #include <linux/errno.h>
23555 +#include "oprofile_perfmon.h"
23556   
23557  extern int perfmon_init(struct oprofile_operations * ops);
23558  extern void perfmon_exit(void);
23559 @@ -20,6 +21,13 @@ int __init oprofile_arch_init(struct opr
23560  {
23561         int ret = -ENODEV;
23562  
23563 +       if (is_running_on_xen()) {
23564 +               ret = xen_perfmon_init();
23565 +               if (ret)
23566 +                       return ret;
23567 +               return xenoprofile_init(ops);
23568 +       }
23569 +
23570  #ifdef CONFIG_PERFMON
23571         /* perfmon_init() can fail, but we have no way to report it */
23572         ret = perfmon_init(ops);
23573 @@ -32,6 +40,12 @@ int __init oprofile_arch_init(struct opr
23574  
23575  void oprofile_arch_exit(void)
23576  {
23577 +       if (is_running_on_xen()) {
23578 +               xenoprofile_exit();
23579 +               xen_perfmon_exit();
23580 +               return;
23581 +       }
23582 +
23583  #ifdef CONFIG_PERFMON
23584         perfmon_exit();
23585  #endif
23586 diff -ruNp linux-2.6.19/arch/ia64/oprofile/oprofile_perfmon.h linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/oprofile_perfmon.h
23587 --- linux-2.6.19/arch/ia64/oprofile/oprofile_perfmon.h  1970-01-01 00:00:00.000000000 +0000
23588 +++ linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/oprofile_perfmon.h        2007-02-02 19:10:21.000000000 +0000
23589 @@ -0,0 +1,30 @@
23590 +#ifndef OPROFILE_PERFMON_H
23591 +#define OPROFILE_PERFMON_H
23592 +
23593 +#include <linux/config.h>
23594 +
23595 +#ifdef CONFIG_PERFMON
23596 +int __perfmon_init(void);
23597 +void __perfmon_exit(void);
23598 +int perfmon_start(void);
23599 +void perfmon_stop(void);
23600 +#else
23601 +#define __perfmon_init()       (-ENOSYS)
23602 +#define __perfmon_exit()       do {} while (0)
23603 +#endif /* CONFIG_PERFMON */
23604 +
23605 +#ifdef CONFIG_XEN
23606 +#define STATIC_IF_NO_XEN       /* nothing */
23607 +#define xen_perfmon_init()     __perfmon_init()
23608 +#define xen_perfmon_exit()     __perfmon_exit()
23609 +extern int xenoprofile_init(struct oprofile_operations * ops);
23610 +extern void xenoprofile_exit(void);
23611 +#else
23612 +#define STATIC_IF_NO_XEN       static
23613 +#define xen_perfmon_init()     (-ENOSYS)
23614 +#define xen_perfmon_exit()     do {} while (0)
23615 +#define xenoprofile_init()     (-ENOSYS)
23616 +#define xenoprofile_exit()     do {} while (0)
23617 +#endif /* CONFIG_XEN */
23618 +
23619 +#endif /* OPROFILE_PERFMON_H */
23620 diff -ruNp linux-2.6.19/arch/ia64/oprofile/perfmon.c linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/perfmon.c
23621 --- linux-2.6.19/arch/ia64/oprofile/perfmon.c   2006-11-29 21:57:37.000000000 +0000
23622 +++ linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/perfmon.c 2007-02-02 19:10:21.000000000 +0000
23623 @@ -13,6 +13,7 @@
23624  #include <asm/perfmon.h>
23625  #include <asm/ptrace.h>
23626  #include <asm/errno.h>
23627 +#include "oprofile_perfmon.h"
23628  
23629  static int allow_ints;
23630  
23631 @@ -33,14 +34,16 @@ perfmon_handler(struct task_struct *task
23632  }
23633  
23634  
23635 -static int perfmon_start(void)
23636 +STATIC_IF_NO_XEN
23637 +int perfmon_start(void)
23638  {
23639         allow_ints = 1;
23640         return 0;
23641  }
23642  
23643  
23644 -static void perfmon_stop(void)
23645 +STATIC_IF_NO_XEN
23646 +void perfmon_stop(void)
23647  {
23648         allow_ints = 0;
23649  }
23650 @@ -75,16 +78,35 @@ static char * get_cpu_type(void)
23651  
23652  static int using_perfmon;
23653  
23654 -int perfmon_init(struct oprofile_operations * ops)
23655 +STATIC_IF_NO_XEN
23656 +int __perfmon_init(void)
23657  {
23658         int ret = pfm_register_buffer_fmt(&oprofile_fmt);
23659         if (ret)
23660                 return -ENODEV;
23661  
23662 +       using_perfmon = 1;
23663 +       return 0;
23664 +}
23665 +
23666 +STATIC_IF_NO_XEN
23667 +void __perfmon_exit(void)
23668 +{
23669 +       if (!using_perfmon)
23670 +               return;
23671 +
23672 +       pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
23673 +}
23674 +
23675 +int perfmon_init(struct oprofile_operations * ops)
23676 +{
23677 +       int ret = __perfmon_init();
23678 +       if (ret)
23679 +               return -ENODEV;
23680 +
23681         ops->cpu_type = get_cpu_type();
23682         ops->start = perfmon_start;
23683         ops->stop = perfmon_stop;
23684 -       using_perfmon = 1;
23685         printk(KERN_INFO "oprofile: using perfmon.\n");
23686         return 0;
23687  }
23688 @@ -92,8 +114,5 @@ int perfmon_init(struct oprofile_operati
23689  
23690  void perfmon_exit(void)
23691  {
23692 -       if (!using_perfmon)
23693 -               return;
23694 -
23695 -       pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
23696 +       __perfmon_exit();
23697  }
23698 diff -ruNp linux-2.6.19/arch/ia64/oprofile/xenoprof.c linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/xenoprof.c
23699 --- linux-2.6.19/arch/ia64/oprofile/xenoprof.c  1970-01-01 00:00:00.000000000 +0000
23700 +++ linux-2.6.19-xen-3.0.4/arch/ia64/oprofile/xenoprof.c        2007-02-02 19:10:21.000000000 +0000
23701 @@ -0,0 +1,142 @@
23702 +/******************************************************************************
23703 + * xenoprof ia64 specific part
23704 + *
23705 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
23706 + *                    VA Linux Systems Japan K.K.
23707 + *
23708 + * This program is free software; you can redistribute it and/or modify
23709 + * it under the terms of the GNU General Public License as published by
23710 + * the Free Software Foundation; either version 2 of the License, or
23711 + * (at your option) any later version.
23712 + *
23713 + * This program is distributed in the hope that it will be useful,
23714 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
23715 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23716 + * GNU General Public License for more details.
23717 + *
23718 + * You should have received a copy of the GNU General Public License
23719 + * along with this program; if not, write to the Free Software
23720 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23721 + *
23722 + */
23723 +#include <linux/init.h>
23724 +#include <linux/oprofile.h>
23725 +#include <linux/ioport.h>
23726 +
23727 +#include <xen/driver_util.h>
23728 +#include <xen/interface/xen.h>
23729 +#include <xen/interface/xenoprof.h>
23730 +#include <xen/xenoprof.h>
23731 +
23732 +#include "oprofile_perfmon.h"
23733 +
23734 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
23735 +{
23736 +       init->num_events = 0; /* perfmon manages. */
23737 +}
23738 +
23739 +void xenoprof_arch_counter(void)
23740 +{
23741 +       /* nothing to do; perfmon handles this. */
23742 +}
23743 +
23744 +void xenoprof_arch_start(void) 
23745 +{
23746 +       perfmon_start();
23747 +}
23748 +
23749 +void xenoprof_arch_stop(void)
23750 +{
23751 +       perfmon_stop();
23752 +}
23753 +
23754 +/* XXX move them to an appropriate header file. */
23755 +struct resource* xen_ia64_allocate_resource(unsigned long size); 
23756 +void xen_ia64_release_resource(struct resource* res); 
23757 +void xen_ia64_unmap_resource(struct resource* res); 
23758 +
23759 +struct resource*
23760 +xenoprof_ia64_allocate_resource(int32_t max_samples)
23761 +{
23762 +       unsigned long bufsize;
23763 +
23764 +       /* XXX add hypercall to get bufsize? */
23765 +       /*     this value is taken from alloc_xenoprof_struct(). */
23766 +#if 0
23767 +       bufsize = NR_CPUS * (sizeof(struct xenoprof_buf) +
23768 +                            (max_samples - 1) * sizeof(struct event_log));
23769 +       bufsize = PAGE_ALIGN(bufsize) + PAGE_SIZE;
23770 +#else
23771 +#define MAX_OPROF_SHARED_PAGES 32
23772 +       bufsize = (MAX_OPROF_SHARED_PAGES + 1) * PAGE_SIZE;
23773 +#endif
23774 +       return xen_ia64_allocate_resource(bufsize);
23775 +}
23776 +
23777 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf)
23778 +{
23779 +       if (sbuf->buffer) {
23780 +               xen_ia64_unmap_resource(sbuf->arch.res);
23781 +               sbuf->buffer = NULL;
23782 +               sbuf->arch.res = NULL;
23783 +       }
23784 +}
23785 +
23786 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer,
23787 +                                    struct xenoprof_shared_buffer* sbuf)
23788 +{
23789 +       int ret;
23790 +       struct resource* res;
23791 +
23792 +       sbuf->buffer = NULL;
23793 +       sbuf->arch.res = NULL;
23794 +
23795 +       res = xenoprof_ia64_allocate_resource(get_buffer->max_samples);
23796 +       if (IS_ERR(res))
23797 +               return PTR_ERR(res);
23798 +
23799 +       get_buffer->buf_gmaddr = res->start;
23800 +
23801 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer);
23802 +       if (ret) {
23803 +               xen_ia64_release_resource(res);
23804 +               return ret;
23805 +       }
23806 +
23807 +       BUG_ON((res->end - res->start + 1) <
23808 +              get_buffer->bufsize * get_buffer->nbuf);
23809 +
23810 +       sbuf->buffer = __va(res->start);
23811 +       sbuf->arch.res = res;
23812 +
23813 +       return ret;
23814 +}
23815 +
23816 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain,
23817 +                              struct xenoprof_shared_buffer* sbuf)
23818 +{
23819 +       int ret;
23820 +       struct resource* res;
23821 +
23822 +       sbuf->buffer = NULL;
23823 +       sbuf->arch.res = NULL;
23824 +
23825 +       res = xenoprof_ia64_allocate_resource(pdomain->max_samples);
23826 +       if (IS_ERR(res))
23827 +               return PTR_ERR(res);
23828 +
23829 +       pdomain->buf_gmaddr = res->start;
23830 +
23831 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
23832 +       if (ret) {
23833 +               xen_ia64_release_resource(res);
23834 +               return ret;
23835 +       }
23836 +
23837 +       BUG_ON((res->end - res->start + 1) < pdomain->bufsize * pdomain->nbuf);
23838 +
23839 +       sbuf->buffer = __va(res->start);
23840 +       sbuf->arch.res = res;
23841 +
23842 +       return ret;
23843 +}
23844 diff -ruNp linux-2.6.19/arch/ia64/xen/Makefile linux-2.6.19-xen-3.0.4/arch/ia64/xen/Makefile
23845 --- linux-2.6.19/arch/ia64/xen/Makefile 1970-01-01 00:00:00.000000000 +0000
23846 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/Makefile       2007-02-02 19:10:21.000000000 +0000
23847 @@ -0,0 +1,9 @@
23848 +#
23849 +# Makefile for Xen components
23850 +#
23851 +
23852 +obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o \
23853 +        hypervisor.o pci-dma-xen.o util.o xencomm.o xcom_hcall.o \
23854 +        xcom_mini.o xcom_privcmd.o mem.o
23855 +
23856 +pci-dma-xen-y := ../../i386/kernel/pci-dma-xen.o
23857 diff -ruNp linux-2.6.19/arch/ia64/xen/drivers/README linux-2.6.19-xen-3.0.4/arch/ia64/xen/drivers/README
23858 --- linux-2.6.19/arch/ia64/xen/drivers/README   1970-01-01 00:00:00.000000000 +0000
23859 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/drivers/README 2007-02-02 19:10:21.000000000 +0000
23860 @@ -0,0 +1,2 @@
23861 +This is a temporary location for source/Makefiles that need to be
23862 +patched/reworked in drivers/xen to work with xenlinux/ia64.
23863 diff -ruNp linux-2.6.19/arch/ia64/xen/hypercall.S linux-2.6.19-xen-3.0.4/arch/ia64/xen/hypercall.S
23864 --- linux-2.6.19/arch/ia64/xen/hypercall.S      1970-01-01 00:00:00.000000000 +0000
23865 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/hypercall.S    2007-02-02 19:10:21.000000000 +0000
23866 @@ -0,0 +1,411 @@
23867 +/*
23868 + * Support routines for Xen hypercalls
23869 + *
23870 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
23871 + */
23872 +
23873 +#include <asm/processor.h>
23874 +#include <asm/asmmacro.h>
23875 +
23876 +/* To clear vpsr.ic, vpsr.i needs to be cleared first */
23877 +#define XEN_CLEAR_PSR_IC                               \
23878 +       mov r14=1;                                      \
23879 +       movl r15=XSI_PSR_I_ADDR;                        \
23880 +       movl r2=XSI_PSR_IC;                             \
23881 +       ;;                                              \
23882 +       ld8 r15=[r15];                                  \
23883 +       ld4 r3=[r2];                                    \
23884 +       ;;                                              \
23885 +       ld1 r16=[r15];                                  \
23886 +       ;;                                              \
23887 +       st1 [r15]=r14;                                  \
23888 +       st4 [r2]=r0;                                    \
23889 +       ;;
23890 +
23891 +/* First restore vpsr.ic, and then vpsr.i */
23892 +#define XEN_RESTORE_PSR_IC                             \
23893 +       st4 [r2]=r3;                                    \
23894 +       st1 [r15]=r16;                                  \
23895 +       ;;
23896 +
23897 +GLOBAL_ENTRY(xen_get_ivr)
23898 +       movl r8=running_on_xen;;
23899 +       ld4 r8=[r8];;
23900 +       cmp.eq p7,p0=r8,r0;;
23901 +(p7)   mov r8=cr.ivr;;
23902 +(p7)   br.ret.sptk.many rp
23903 +       ;;
23904 +       XEN_CLEAR_PSR_IC
23905 +       ;;
23906 +       XEN_HYPER_GET_IVR
23907 +       ;;
23908 +       XEN_RESTORE_PSR_IC
23909 +       ;;
23910 +       br.ret.sptk.many rp
23911 +       ;;
23912 +END(xen_get_ivr)
23913 +
23914 +GLOBAL_ENTRY(xen_get_tpr)
23915 +       movl r8=running_on_xen;;
23916 +       ld4 r8=[r8];;
23917 +       cmp.eq p7,p0=r8,r0;;
23918 +(p7)   mov r8=cr.tpr;;
23919 +(p7)   br.ret.sptk.many rp
23920 +       ;;
23921 +       XEN_CLEAR_PSR_IC
23922 +       ;;
23923 +       XEN_HYPER_GET_TPR
23924 +       ;;
23925 +       XEN_RESTORE_PSR_IC
23926 +       ;;
23927 +       br.ret.sptk.many rp
23928 +       ;;
23929 +END(xen_get_tpr)
23930 +
23931 +GLOBAL_ENTRY(xen_set_tpr)
23932 +       movl r8=running_on_xen;;
23933 +       ld4 r8=[r8];;
23934 +       cmp.eq p7,p0=r8,r0;;
23935 +(p7)   mov cr.tpr=r32;;
23936 +(p7)   br.ret.sptk.many rp
23937 +       ;;
23938 +       mov r8=r32
23939 +       ;;
23940 +       XEN_CLEAR_PSR_IC
23941 +       ;;
23942 +       XEN_HYPER_SET_TPR
23943 +       ;;
23944 +       XEN_RESTORE_PSR_IC
23945 +       ;;
23946 +       br.ret.sptk.many rp
23947 +       ;;
23948 +END(xen_set_tpr)
23949 +
23950 +GLOBAL_ENTRY(xen_eoi)
23951 +       movl r8=running_on_xen;;
23952 +       ld4 r8=[r8];;
23953 +       cmp.eq p7,p0=r8,r0;;
23954 +(p7)   mov cr.eoi=r0;;
23955 +(p7)   br.ret.sptk.many rp
23956 +       ;;
23957 +       mov r8=r32
23958 +       ;;
23959 +       XEN_CLEAR_PSR_IC
23960 +       ;;
23961 +       XEN_HYPER_EOI
23962 +       ;;
23963 +       XEN_RESTORE_PSR_IC
23964 +       ;;
23965 +       br.ret.sptk.many rp
23966 +       ;;
23967 +END(xen_eoi)
23968 +
23969 +GLOBAL_ENTRY(xen_thash)
23970 +       movl r8=running_on_xen;;
23971 +       ld4 r8=[r8];;
23972 +       cmp.eq p7,p0=r8,r0;;
23973 +(p7)   thash r8=r32;;
23974 +(p7)   br.ret.sptk.many rp
23975 +       ;;
23976 +       mov r8=r32
23977 +       ;;
23978 +       XEN_CLEAR_PSR_IC
23979 +       ;;
23980 +       XEN_HYPER_THASH
23981 +       ;;
23982 +       XEN_RESTORE_PSR_IC
23983 +       ;;
23984 +       br.ret.sptk.many rp
23985 +       ;;
23986 +END(xen_thash)
23987 +
23988 +GLOBAL_ENTRY(xen_set_itm)
23989 +       movl r8=running_on_xen;;
23990 +       ld4 r8=[r8];;
23991 +       cmp.eq p7,p0=r8,r0;;
23992 +(p7)   mov cr.itm=r32;;
23993 +(p7)   br.ret.sptk.many rp
23994 +       ;;
23995 +       mov r8=r32
23996 +       ;;
23997 +       XEN_CLEAR_PSR_IC
23998 +       ;;
23999 +       XEN_HYPER_SET_ITM
24000 +       ;;
24001 +       XEN_RESTORE_PSR_IC
24002 +       ;;
24003 +       br.ret.sptk.many rp
24004 +       ;;
24005 +END(xen_set_itm)
24006 +
24007 +GLOBAL_ENTRY(xen_ptcga)
24008 +       movl r8=running_on_xen;;
24009 +       ld4 r8=[r8];;
24010 +       cmp.eq p7,p0=r8,r0;;
24011 +(p7)   ptc.ga r32,r33;;
24012 +(p7)   br.ret.sptk.many rp
24013 +       ;;
24014 +       mov r8=r32
24015 +       mov r9=r33
24016 +       ;;
24017 +       XEN_CLEAR_PSR_IC
24018 +       ;;
24019 +       XEN_HYPER_PTC_GA
24020 +       ;;
24021 +       XEN_RESTORE_PSR_IC
24022 +       ;;
24023 +       br.ret.sptk.many rp
24024 +       ;;
24025 +END(xen_ptcga)
24026 +
24027 +GLOBAL_ENTRY(xen_get_rr)
24028 +       movl r8=running_on_xen;;
24029 +       ld4 r8=[r8];;
24030 +       cmp.eq p7,p0=r8,r0;;
24031 +(p7)   mov r8=rr[r32];;
24032 +(p7)   br.ret.sptk.many rp
24033 +       ;;
24034 +       mov r8=r32
24035 +       ;;
24036 +       XEN_CLEAR_PSR_IC
24037 +       ;;
24038 +       XEN_HYPER_GET_RR
24039 +       ;;
24040 +       XEN_RESTORE_PSR_IC
24041 +       ;;
24042 +       br.ret.sptk.many rp
24043 +       ;;
24044 +END(xen_get_rr)
24045 +
24046 +GLOBAL_ENTRY(xen_set_rr)
24047 +       movl r8=running_on_xen;;
24048 +       ld4 r8=[r8];;
24049 +       cmp.eq p7,p0=r8,r0;;
24050 +(p7)   mov rr[r32]=r33;;
24051 +(p7)   br.ret.sptk.many rp
24052 +       ;;
24053 +       mov r8=r32
24054 +       mov r9=r33
24055 +       ;;
24056 +       XEN_CLEAR_PSR_IC
24057 +       ;;
24058 +       XEN_HYPER_SET_RR
24059 +       ;;
24060 +       XEN_RESTORE_PSR_IC
24061 +       ;;
24062 +       br.ret.sptk.many rp
24063 +       ;;
24064 +END(xen_set_rr)
24065 +
24066 +GLOBAL_ENTRY(xen_set_kr)
24067 +       movl r8=running_on_xen;;
24068 +       ld4 r8=[r8];;
24069 +       cmp.ne p7,p0=r8,r0;;
24070 +(p7)   br.cond.spnt.few 1f;
24071 +       ;;
24072 +       cmp.eq p7,p0=r8,r0
24073 +       adds r8=-1,r8;;
24074 +(p7)   mov ar0=r9
24075 +(p7)   br.ret.sptk.many rp;;
24076 +       cmp.eq p7,p0=r8,r0
24077 +       adds r8=-1,r8;;
24078 +(p7)   mov ar1=r9
24079 +(p7)   br.ret.sptk.many rp;;
24080 +       cmp.eq p7,p0=r8,r0
24081 +       adds r8=-1,r8;;
24082 +(p7)   mov ar2=r9
24083 +(p7)   br.ret.sptk.many rp;;
24084 +       cmp.eq p7,p0=r8,r0
24085 +       adds r8=-1,r8;;
24086 +(p7)   mov ar3=r9
24087 +(p7)   br.ret.sptk.many rp;;
24088 +       cmp.eq p7,p0=r8,r0
24089 +       adds r8=-1,r8;;
24090 +(p7)   mov ar4=r9
24091 +(p7)   br.ret.sptk.many rp;;
24092 +       cmp.eq p7,p0=r8,r0
24093 +       adds r8=-1,r8;;
24094 +(p7)   mov ar5=r9
24095 +(p7)   br.ret.sptk.many rp;;
24096 +       cmp.eq p7,p0=r8,r0
24097 +       adds r8=-1,r8;;
24098 +(p7)   mov ar6=r9
24099 +(p7)   br.ret.sptk.many rp;;
24100 +       cmp.eq p7,p0=r8,r0
24101 +       adds r8=-1,r8;;
24102 +(p7)   mov ar7=r9
24103 +(p7)   br.ret.sptk.many rp;;
24104 +
24105 +1:     mov r8=r32
24106 +       mov r9=r33
24107 +       ;;
24108 +       XEN_CLEAR_PSR_IC
24109 +       ;;
24110 +       XEN_HYPER_SET_KR
24111 +       ;;
24112 +       XEN_RESTORE_PSR_IC
24113 +       ;;
24114 +       br.ret.sptk.many rp
24115 +END(xen_set_kr)
24116 +
24117 +GLOBAL_ENTRY(xen_fc)
24118 +       movl r8=running_on_xen;;
24119 +       ld4 r8=[r8];;
24120 +       cmp.eq p7,p0=r8,r0;;
24121 +(p7)   fc r32;;
24122 +(p7)   br.ret.sptk.many rp
24123 +       ;;
24124 +       mov r8=r32
24125 +       ;;
24126 +       XEN_CLEAR_PSR_IC
24127 +       ;;
24128 +       XEN_HYPER_FC
24129 +       ;;
24130 +       XEN_RESTORE_PSR_IC
24131 +       ;;
24132 +       br.ret.sptk.many rp
24133 +END(xen_fc)
24134 +
24135 +GLOBAL_ENTRY(xen_get_cpuid)
24136 +       movl r8=running_on_xen;;
24137 +       ld4 r8=[r8];;
24138 +       cmp.eq p7,p0=r8,r0;;
24139 +(p7)   mov r8=cpuid[r32];;
24140 +(p7)   br.ret.sptk.many rp
24141 +       ;;
24142 +       mov r8=r32
24143 +       ;;
24144 +       XEN_CLEAR_PSR_IC
24145 +       ;;
24146 +       XEN_HYPER_GET_CPUID
24147 +       ;;
24148 +       XEN_RESTORE_PSR_IC
24149 +       ;;
24150 +       br.ret.sptk.many rp
24151 +END(xen_get_cpuid)
24152 +
24153 +GLOBAL_ENTRY(xen_get_pmd)
24154 +       movl r8=running_on_xen;;
24155 +       ld4 r8=[r8];;
24156 +       cmp.eq p7,p0=r8,r0;;
24157 +(p7)   mov r8=pmd[r32];;
24158 +(p7)   br.ret.sptk.many rp
24159 +       ;;
24160 +       mov r8=r32
24161 +       ;;
24162 +       XEN_CLEAR_PSR_IC
24163 +       ;;
24164 +       XEN_HYPER_GET_PMD
24165 +       ;;
24166 +       XEN_RESTORE_PSR_IC
24167 +       ;;
24168 +       br.ret.sptk.many rp
24169 +END(xen_get_pmd)
24170 +
24171 +#ifdef CONFIG_IA32_SUPPORT
24172 +GLOBAL_ENTRY(xen_get_eflag)
24173 +       movl r8=running_on_xen;;
24174 +       ld4 r8=[r8];;
24175 +       cmp.eq p7,p0=r8,r0;;
24176 +(p7)   mov r8=ar24;;
24177 +(p7)   br.ret.sptk.many rp
24178 +       ;;
24179 +       mov r8=r32
24180 +       ;;
24181 +       XEN_CLEAR_PSR_IC
24182 +       ;;
24183 +       XEN_HYPER_GET_EFLAG
24184 +       ;;
24185 +       XEN_RESTORE_PSR_IC
24186 +       ;;
24187 +       br.ret.sptk.many rp
24188 +END(xen_get_eflag)
24189 +       
24190 +// some bits aren't set if pl!=0, see SDM vol1 3.1.8
24191 +GLOBAL_ENTRY(xen_set_eflag)
24192 +       movl r8=running_on_xen;;
24193 +       ld4 r8=[r8];;
24194 +       cmp.eq p7,p0=r8,r0;;
24195 +(p7)   mov ar24=r32
24196 +(p7)   br.ret.sptk.many rp
24197 +       ;;
24198 +       mov r8=r32
24199 +       ;;
24200 +       XEN_CLEAR_PSR_IC
24201 +       ;;
24202 +       XEN_HYPER_SET_EFLAG
24203 +       ;;
24204 +       XEN_RESTORE_PSR_IC
24205 +       ;;
24206 +       br.ret.sptk.many rp
24207 +END(xen_set_eflag)
24208 +#endif
24209 +
24210 +GLOBAL_ENTRY(xen_send_ipi)
24211 +        mov r14=r32
24212 +        mov r15=r33
24213 +        mov r2=0x400
24214 +        break 0x1000
24215 +        ;;
24216 +        br.ret.sptk.many rp
24217 +        ;;
24218 +END(xen_send_ipi)
24219 +
24220 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
24221 +// These routines are specialized for the vdso.
24222 +// In fsys mode, call and ret can't be used.
24223 +GLOBAL_ENTRY(xen_rsm_be_i)
24224 +       st1 [r22]=r20
24225 +       st4 [r23]=r0
24226 +       XEN_HYPER_RSM_BE
24227 +       st4 [r23]=r20
24228 +       brl.cond.sptk   .vdso_rsm_be_i_ret
24229 +       ;; 
24230 +END(xen_rsm_be_i)
24231 +
24232 +GLOBAL_ENTRY(xen_get_psr)
24233 +       mov r31=r8
24234 +       mov r25=IA64_PSR_IC
24235 +       st4 [r23]=r0
24236 +       XEN_HYPER_GET_PSR
24237 +       ;; 
24238 +       st4 [r23]=r20
24239 +       or r29=r8,r25 // vpsr.ic was cleared for hyperprivop
24240 +       mov r8=r31
24241 +       brl.cond.sptk   .vdso_get_psr_ret
24242 +       ;; 
24243 +END(xen_get_psr)
24244 +
24245 +       // see xen_ssm_i() in privop.h
24246 +       // r22 = &vcpu->vcpu_info->evtchn_upcall_mask
24247 +       // r23 = &vpsr.ic
24248 +       // r24 = &vcpu->vcpu_info->evtchn_upcall_pending
24249 +       // r25 = tmp
24250 +       // r31 = tmp
24251 +       // p11 = tmp
24252 +       // p14 = tmp
24253 +#define XEN_SET_PSR_I                  \
24254 +       ld1 r31=[r22];                  \
24255 +       ld1 r25=[r24];                  \
24256 +       ;;                              \
24257 +       st1 [r22]=r0;                   \
24258 +       cmp.ne.unc p14,p0=r0,r31;       \
24259 +       ;;                              \
24260 +(p14)  cmp.ne.unc p11,p0=r0,r25;       \
24261 +       ;;                              \
24262 +(p11)  st1 [r22]=r20;                  \
24263 +(p11)  st4 [r23]=r0;                   \
24264 +(p11)  XEN_HYPER_SSM_I;
24265 +               
24266 +GLOBAL_ENTRY(xen_ssm_i_0)
24267 +       XEN_SET_PSR_I
24268 +       brl.cond.sptk   .vdso_ssm_i_0_ret
24269 +       ;; 
24270 +END(xen_ssm_i_0)
24271 +
24272 +GLOBAL_ENTRY(xen_ssm_i_1)
24273 +       XEN_SET_PSR_I
24274 +       brl.cond.sptk   .vdso_ssm_i_1_ret
24275 +       ;; 
24276 +END(xen_ssm_i_1)
24277 +#endif
24278 diff -ruNp linux-2.6.19/arch/ia64/xen/hypervisor.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/hypervisor.c
24279 --- linux-2.6.19/arch/ia64/xen/hypervisor.c     1970-01-01 00:00:00.000000000 +0000
24280 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/hypervisor.c   2007-02-02 19:10:21.000000000 +0000
24281 @@ -0,0 +1,1104 @@
24282 +/******************************************************************************
24283 + * arch/ia64/xen/hypervisor.c
24284 + *
24285 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
24286 + *                    VA Linux Systems Japan K.K.
24287 + *
24288 + * This program is free software; you can redistribute it and/or modify
24289 + * it under the terms of the GNU General Public License as published by
24290 + * the Free Software Foundation; either version 2 of the License, or
24291 + * (at your option) any later version.
24292 + *
24293 + * This program is distributed in the hope that it will be useful,
24294 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
24295 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24296 + * GNU General Public License for more details.
24297 + *
24298 + * You should have received a copy of the GNU General Public License
24299 + * along with this program; if not, write to the Free Software
24300 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24301 + *
24302 + */
24303 +
24304 +//#include <linux/kernel.h>
24305 +#include <linux/spinlock.h>
24306 +#include <linux/bootmem.h>
24307 +#include <linux/module.h>
24308 +#include <linux/vmalloc.h>
24309 +#include <asm/page.h>
24310 +#include <asm/hypervisor.h>
24311 +#include <asm/hypercall.h>
24312 +#include <xen/interface/memory.h>
24313 +#include <xen/balloon.h>
24314 +
24315 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)XSI_BASE;
24316 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
24317 +
24318 +start_info_t *xen_start_info;
24319 +EXPORT_SYMBOL(xen_start_info);
24320 +
24321 +int running_on_xen;
24322 +EXPORT_SYMBOL(running_on_xen);
24323 +
24324 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
24325 +static int p2m_expose_init(void);
24326 +#else
24327 +#define p2m_expose_init() (-ENOSYS)
24328 +#endif
24329 +
24330 +//XXX same as i386, x86_64 contiguous_bitmap_set(), contiguous_bitmap_clear()
24331 +// move those to lib/contiguous_bitmap?
24332 +//XXX discontigmem/sparsemem
24333 +
24334 +/*
24335 + * Bitmap is indexed by page number. If bit is set, the page is part of a
24336 + * xen_create_contiguous_region() area of memory.
24337 + */
24338 +unsigned long *contiguous_bitmap;
24339 +
24340 +void
24341 +contiguous_bitmap_init(unsigned long end_pfn)
24342 +{
24343 +       unsigned long size = (end_pfn + 2 * BITS_PER_LONG) >> 3;
24344 +       contiguous_bitmap = alloc_bootmem_low_pages(size);
24345 +       BUG_ON(!contiguous_bitmap);
24346 +       memset(contiguous_bitmap, 0, size);
24347 +}
24348 +
24349 +#if 0
24350 +int
24351 +contiguous_bitmap_test(void* p)
24352 +{
24353 +       return test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap);
24354 +}
24355 +#endif
24356 +
24357 +static void contiguous_bitmap_set(
24358 +       unsigned long first_page, unsigned long nr_pages)
24359 +{
24360 +       unsigned long start_off, end_off, curr_idx, end_idx;
24361 +
24362 +       curr_idx  = first_page / BITS_PER_LONG;
24363 +       start_off = first_page & (BITS_PER_LONG-1);
24364 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
24365 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
24366 +
24367 +       if (curr_idx == end_idx) {
24368 +               contiguous_bitmap[curr_idx] |=
24369 +                       ((1UL<<end_off)-1) & -(1UL<<start_off);
24370 +       } else {
24371 +               contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
24372 +               while ( ++curr_idx < end_idx )
24373 +                       contiguous_bitmap[curr_idx] = ~0UL;
24374 +               contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
24375 +       }
24376 +}
24377 +
24378 +static void contiguous_bitmap_clear(
24379 +       unsigned long first_page, unsigned long nr_pages)
24380 +{
24381 +       unsigned long start_off, end_off, curr_idx, end_idx;
24382 +
24383 +       curr_idx  = first_page / BITS_PER_LONG;
24384 +       start_off = first_page & (BITS_PER_LONG-1);
24385 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
24386 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
24387 +
24388 +       if (curr_idx == end_idx) {
24389 +               contiguous_bitmap[curr_idx] &=
24390 +                       -(1UL<<end_off) | ((1UL<<start_off)-1);
24391 +       } else {
24392 +               contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
24393 +               while ( ++curr_idx != end_idx )
24394 +                       contiguous_bitmap[curr_idx] = 0;
24395 +               contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
24396 +       }
24397 +}
24398 +
24399 +// __xen_create_contiguous_region(), __xen_destroy_contiguous_region()
24400 +// are based on i386 xen_create_contiguous_region(),
24401 +// xen_destroy_contiguous_region()
24402 +
24403 +/* Protected by balloon_lock. */
24404 +#define MAX_CONTIG_ORDER 7
24405 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
24406 +
24407 +/* Ensure multi-page extents are contiguous in machine memory. */
24408 +int
24409 +__xen_create_contiguous_region(unsigned long vstart,
24410 +                              unsigned int order, unsigned int address_bits)
24411 +{
24412 +       unsigned long error = 0;
24413 +       unsigned long gphys = __pa(vstart);
24414 +       unsigned long start_gpfn = gphys >> PAGE_SHIFT;
24415 +       unsigned long num_gpfn = 1 << order;
24416 +       unsigned long i;
24417 +       unsigned long flags;
24418 +
24419 +       unsigned long *in_frames = discontig_frames, out_frame;
24420 +       int success;
24421 +       struct xen_memory_exchange exchange = {
24422 +               .in = {
24423 +                       .nr_extents   = num_gpfn,
24424 +                       .extent_order = 0,
24425 +                       .domid        = DOMID_SELF
24426 +               },
24427 +               .out = {
24428 +                        .nr_extents   = 1,
24429 +                        .extent_order = order,
24430 +                        .address_bits = address_bits,
24431 +                        .domid        = DOMID_SELF
24432 +                },
24433 +               .nr_exchanged = 0
24434 +       };
24435 +
24436 +       if (unlikely(order > MAX_CONTIG_ORDER))
24437 +               return -ENOMEM;
24438 +       
24439 +       set_xen_guest_handle(exchange.in.extent_start, in_frames);
24440 +       set_xen_guest_handle(exchange.out.extent_start, &out_frame);
24441 +
24442 +       scrub_pages(vstart, num_gpfn);
24443 +
24444 +       balloon_lock(flags);
24445 +
24446 +       /* Get a new contiguous memory extent. */
24447 +       for (i = 0; i < num_gpfn; i++) {
24448 +               in_frames[i] = start_gpfn + i;
24449 +       }
24450 +       out_frame = start_gpfn;
24451 +       error = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
24452 +       success = (exchange.nr_exchanged == num_gpfn);
24453 +       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (error == 0)));
24454 +       BUG_ON(success && (error != 0));
24455 +       if (unlikely(error == -ENOSYS)) {
24456 +               /* Compatibility when XENMEM_exchange is unsupported. */
24457 +               error = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
24458 +                                            &exchange.in);
24459 +               BUG_ON(error != num_gpfn);
24460 +               error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24461 +                                            &exchange.out);
24462 +               if (error != 1) {
24463 +                       /* Couldn't get special memory: fall back to normal. */
24464 +                       for (i = 0; i < num_gpfn; i++) {
24465 +                               in_frames[i] = start_gpfn + i;
24466 +                       }
24467 +                       error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24468 +                                                    &exchange.in);
24469 +                       BUG_ON(error != num_gpfn);
24470 +                       success = 0;
24471 +               } else
24472 +                       success = 1;
24473 +       }
24474 +       if (success)
24475 +               contiguous_bitmap_set(start_gpfn, num_gpfn);
24476 +#if 0
24477 +       if (success) {
24478 +               unsigned long mfn;
24479 +               unsigned long mfn_prev = ~0UL;
24480 +               for (i = 0; i < num_gpfn; i++) {
24481 +                       mfn = pfn_to_mfn_for_dma(start_gpfn + i);
24482 +                       if (mfn_prev != ~0UL && mfn != mfn_prev + 1) {
24483 +                               xprintk("\n");
24484 +                               xprintk("%s:%d order %d "
24485 +                                       "start 0x%lx bus 0x%lx "
24486 +                                       "machine 0x%lx\n",
24487 +                                       __func__, __LINE__, order,
24488 +                                       vstart, virt_to_bus((void*)vstart),
24489 +                                       phys_to_machine_for_dma(gphys));
24490 +                               xprintk("mfn: ");
24491 +                               for (i = 0; i < num_gpfn; i++) {
24492 +                                       mfn = pfn_to_mfn_for_dma(
24493 +                                               start_gpfn + i);
24494 +                                       xprintk("0x%lx ", mfn);
24495 +                               }
24496 +                               xprintk("\n");
24497 +                               break;
24498 +                       }
24499 +                       mfn_prev = mfn;
24500 +               }
24501 +       }
24502 +#endif
24503 +       balloon_unlock(flags);
24504 +       return success? 0: -ENOMEM;
24505 +}
24506 +
24507 +void
24508 +__xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
24509 +{
24510 +       unsigned long flags;
24511 +       unsigned long error = 0;
24512 +       unsigned long start_gpfn = __pa(vstart) >> PAGE_SHIFT;
24513 +       unsigned long num_gpfn = 1UL << order;
24514 +       unsigned long i;
24515 +
24516 +       unsigned long *out_frames = discontig_frames, in_frame;
24517 +       int            success;
24518 +       struct xen_memory_exchange exchange = {
24519 +               .in = {
24520 +                       .nr_extents   = 1,
24521 +                       .extent_order = order,
24522 +                       .domid        = DOMID_SELF
24523 +               },
24524 +               .out = {
24525 +                        .nr_extents   = num_gpfn,
24526 +                        .extent_order = 0,
24527 +                        .address_bits = 0,
24528 +                        .domid        = DOMID_SELF
24529 +                },
24530 +               .nr_exchanged = 0
24531 +        };
24532 +       
24533 +
24534 +       if (!test_bit(start_gpfn, contiguous_bitmap))
24535 +               return;
24536 +
24537 +       if (unlikely(order > MAX_CONTIG_ORDER))
24538 +               return;
24539 +
24540 +       set_xen_guest_handle(exchange.in.extent_start, &in_frame);
24541 +       set_xen_guest_handle(exchange.out.extent_start, out_frames);
24542 +
24543 +       scrub_pages(vstart, num_gpfn);
24544 +
24545 +       balloon_lock(flags);
24546 +
24547 +       contiguous_bitmap_clear(start_gpfn, num_gpfn);
24548 +
24549 +        /* Do the exchange for non-contiguous MFNs. */
24550 +       in_frame = start_gpfn;
24551 +       for (i = 0; i < num_gpfn; i++) {
24552 +               out_frames[i] = start_gpfn + i;
24553 +       }
24554 +       error = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
24555 +       success = (exchange.nr_exchanged == 1);
24556 +       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (error == 0)));
24557 +       BUG_ON(success && (error != 0));
24558 +       if (unlikely(error == -ENOSYS)) {
24559 +                /* Compatibility when XENMEM_exchange is unsupported. */
24560 +               error = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
24561 +                                            &exchange.in);
24562 +               BUG_ON(error != 1);
24563 +
24564 +               error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24565 +                                            &exchange.out);
24566 +               BUG_ON(error != num_gpfn);
24567 +       }
24568 +       balloon_unlock(flags);
24569 +}
24570 +
24571 +
24572 +///////////////////////////////////////////////////////////////////////////
24573 +// grant table hack
24574 +// cmd: GNTTABOP_xxx
24575 +
24576 +#include <linux/mm.h>
24577 +#include <xen/interface/xen.h>
24578 +#include <xen/gnttab.h>
24579 +
24580 +static void
24581 +gnttab_map_grant_ref_pre(struct gnttab_map_grant_ref *uop)
24582 +{
24583 +       uint32_t flags;
24584 +
24585 +       flags = uop->flags;
24586 +
24587 +       if (flags & GNTMAP_host_map) {
24588 +               if (flags & GNTMAP_application_map) {
24589 +                       xprintd("GNTMAP_application_map is not supported yet: flags 0x%x\n", flags);
24590 +                       BUG();
24591 +               }
24592 +               if (flags & GNTMAP_contains_pte) {
24593 +                       xprintd("GNTMAP_contains_pte is not supported yet flags 0x%x\n", flags);
24594 +                       BUG();
24595 +               }
24596 +       } else if (flags & GNTMAP_device_map) {
24597 +               xprintd("GNTMAP_device_map is not supported yet 0x%x\n", flags);
24598 +               BUG();//XXX not yet. actually this flag is not used.
24599 +       } else {
24600 +               BUG();
24601 +       }
24602 +}
24603 +
24604 +int
24605 +HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
24606 +{
24607 +       if (cmd == GNTTABOP_map_grant_ref) {
24608 +               unsigned int i;
24609 +               for (i = 0; i < count; i++) {
24610 +                       gnttab_map_grant_ref_pre(
24611 +                               (struct gnttab_map_grant_ref*)uop + i);
24612 +               }
24613 +       }
24614 +       return xencomm_mini_hypercall_grant_table_op(cmd, uop, count);
24615 +}
24616 +EXPORT_SYMBOL(HYPERVISOR_grant_table_op);
24617 +
24618 +///////////////////////////////////////////////////////////////////////////
24619 +// PageForeign(), SetPageForeign(), ClearPageForeign()
24620 +
24621 +struct address_space xen_ia64_foreign_dummy_mapping;
24622 +EXPORT_SYMBOL(xen_ia64_foreign_dummy_mapping);
24623 +
24624 +///////////////////////////////////////////////////////////////////////////
24625 +// foreign mapping
24626 +#include <linux/efi.h>
24627 +#include <asm/meminit.h> // for IA64_GRANULE_SIZE, GRANULEROUND{UP,DOWN}()
24628 +
24629 +static unsigned long privcmd_resource_min = 0;
24630 +// Xen/ia64 currently can handle pseudo physical address bits up to
24631 +// (PAGE_SHIFT * 3)
24632 +static unsigned long privcmd_resource_max = GRANULEROUNDDOWN((1UL << (PAGE_SHIFT * 3)) - 1);
24633 +static unsigned long privcmd_resource_align = IA64_GRANULE_SIZE;
24634 +
24635 +static unsigned long
24636 +md_end_addr(const efi_memory_desc_t *md)
24637 +{
24638 +       return md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
24639 +}
24640 +
24641 +#define XEN_IA64_PRIVCMD_LEAST_GAP_SIZE        (1024 * 1024 * 1024UL)
24642 +static int
24643 +xen_ia64_privcmd_check_size(unsigned long start, unsigned long end)
24644 +{
24645 +       return (start < end &&
24646 +               (end - start) > XEN_IA64_PRIVCMD_LEAST_GAP_SIZE);
24647 +}
24648 +
24649 +static int __init
24650 +xen_ia64_privcmd_init(void)
24651 +{
24652 +       void *efi_map_start, *efi_map_end, *p;
24653 +       u64 efi_desc_size;
24654 +       efi_memory_desc_t *md;
24655 +       unsigned long tmp_min;
24656 +       unsigned long tmp_max;
24657 +       unsigned long gap_size;
24658 +       unsigned long prev_end;
24659 +
24660 +       if (!is_running_on_xen())
24661 +               return -1;
24662 +
24663 +       efi_map_start = __va(ia64_boot_param->efi_memmap);
24664 +       efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
24665 +       efi_desc_size = ia64_boot_param->efi_memdesc_size;
24666 +
24667 +       // first, check the highest address in use
24668 +       for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
24669 +               // nothing
24670 +       }
24671 +       md = p - efi_desc_size;
24672 +       privcmd_resource_min = GRANULEROUNDUP(md_end_addr(md));
24673 +       if (xen_ia64_privcmd_check_size(privcmd_resource_min,
24674 +                                       privcmd_resource_max)) {
24675 +               goto out;
24676 +       }
24677 +
24678 +       // the highest address in use is too large; try to find the largest gap.
24679 +       tmp_min = privcmd_resource_max;
24680 +       tmp_max = 0;
24681 +       gap_size = 0;
24682 +       prev_end = 0;
24683 +       for (p = efi_map_start;
24684 +            p < efi_map_end - efi_desc_size;
24685 +            p += efi_desc_size) {
24686 +               unsigned long end;
24687 +               efi_memory_desc_t* next;
24688 +               unsigned long next_start;
24689 +
24690 +               md = p;
24691 +               end = md_end_addr(md);
24692 +               if (end > privcmd_resource_max) {
24693 +                       break;
24694 +               }
24695 +               if (end < prev_end) {
24696 +                       // Work around Xen possibly passing
24697 +                       // incompletely sorted memory descriptors,
24698 +                       // e.g.
24699 +                       // [x, x + length]
24700 +                       // [x, x]
24701 +                       // where this order should be reversed.
24702 +                       continue;
24703 +               }
24704 +               next = p + efi_desc_size;
24705 +               next_start = next->phys_addr;
24706 +               if (next_start > privcmd_resource_max) {
24707 +                       next_start = privcmd_resource_max;
24708 +               }
24709 +               if (end < next_start && gap_size < (next_start - end)) {
24710 +                       tmp_min = end;
24711 +                       tmp_max = next_start;
24712 +                       gap_size = tmp_max - tmp_min;
24713 +               }
24714 +               prev_end = end;
24715 +       }
24716 +
24717 +       privcmd_resource_min = GRANULEROUNDUP(tmp_min);
24718 +       if (xen_ia64_privcmd_check_size(privcmd_resource_min, tmp_max)) {
24719 +               privcmd_resource_max = tmp_max;
24720 +               goto out;
24721 +       }
24722 +
24723 +       privcmd_resource_min = tmp_min;
24724 +       privcmd_resource_max = tmp_max;
24725 +       if (!xen_ia64_privcmd_check_size(privcmd_resource_min,
24726 +                                        privcmd_resource_max)) {
24727 +               // No large enough gap was found.
24728 +               // Go ahead anyway with a warning, hoping that a large
24729 +               // region won't be requested.
24730 +               printk(KERN_WARNING "xen privcmd: large enough region for privcmd mmap is not found.\n");
24731 +       }
24732 +
24733 +out:
24734 +       printk(KERN_INFO "xen privcmd uses pseudo physical addr range [0x%lx, 0x%lx] (%ldMB)\n",
24735 +              privcmd_resource_min, privcmd_resource_max, 
24736 +              (privcmd_resource_max - privcmd_resource_min) >> 20);
24737 +       BUG_ON(privcmd_resource_min >= privcmd_resource_max);
24738 +
24739 +       // XXX this should be somewhere appropriate
24740 +       (void)p2m_expose_init();
24741 +
24742 +       return 0;
24743 +}
24744 +late_initcall(xen_ia64_privcmd_init);
24745 +
24746 +struct xen_ia64_privcmd_entry {
24747 +       atomic_t        map_count;
24748 +#define INVALID_GPFN   (~0UL)
24749 +       unsigned long   gpfn;
24750 +};
24751 +
24752 +struct xen_ia64_privcmd_range {
24753 +       atomic_t                        ref_count;
24754 +       unsigned long                   pgoff; // in PAGE_SIZE
24755 +       struct resource*                res;
24756 +
24757 +       unsigned long                   num_entries;
24758 +       struct xen_ia64_privcmd_entry   entries[0];
24759 +};
24760 +
24761 +struct xen_ia64_privcmd_vma {
24762 +       int                             is_privcmd_mmapped;
24763 +       struct xen_ia64_privcmd_range*  range;
24764 +
24765 +       unsigned long                   num_entries;
24766 +       struct xen_ia64_privcmd_entry*  entries;
24767 +};
24768 +
24769 +static void
24770 +xen_ia64_privcmd_init_entry(struct xen_ia64_privcmd_entry* entry)
24771 +{
24772 +       atomic_set(&entry->map_count, 0);
24773 +       entry->gpfn = INVALID_GPFN;
24774 +}
24775 +
24776 +static int
24777 +xen_ia64_privcmd_entry_mmap(struct vm_area_struct* vma,
24778 +                           unsigned long addr,
24779 +                           struct xen_ia64_privcmd_range* privcmd_range,
24780 +                           int i,
24781 +                           unsigned long gmfn,
24782 +                           pgprot_t prot,
24783 +                           domid_t domid)
24784 +{
24785 +       int error = 0;
24786 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24787 +       unsigned long gpfn;
24788 +       unsigned long flags;
24789 +
24790 +       if ((addr & ~PAGE_MASK) != 0 || gmfn == INVALID_MFN) {
24791 +               error = -EINVAL;
24792 +               goto out;
24793 +       }
24794 +
24795 +       if (entry->gpfn != INVALID_GPFN) {
24796 +               error = -EBUSY;
24797 +               goto out;
24798 +       }
24799 +       gpfn = (privcmd_range->res->start >> PAGE_SHIFT) + i;
24800 +
24801 +       flags = ASSIGN_writable;
24802 +       if (pgprot_val(prot) == PROT_READ) {
24803 +               flags = ASSIGN_readonly;
24804 +       }
24805 +       error = HYPERVISOR_add_physmap_with_gmfn(gpfn, gmfn, flags, domid);
24806 +       if (error != 0) {
24807 +               goto out;
24808 +       }
24809 +
24810 +       prot = vma->vm_page_prot;
24811 +       error = remap_pfn_range(vma, addr, gpfn, 1 << PAGE_SHIFT, prot);
24812 +       if (error != 0) {
24813 +               error = HYPERVISOR_zap_physmap(gpfn, 0);
24814 +               if (error) {
24815 +                       BUG();//XXX
24816 +               }
24817 +       } else {
24818 +               atomic_inc(&entry->map_count);
24819 +               entry->gpfn = gpfn;
24820 +       }
24821 +
24822 +out:
24823 +       return error;
24824 +}
24825 +
24826 +static void
24827 +xen_ia64_privcmd_entry_munmap(struct xen_ia64_privcmd_range* privcmd_range,
24828 +                             int i)
24829 +{
24830 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24831 +       unsigned long gpfn = entry->gpfn;
24832 +       //gpfn = (privcmd_range->res->start >> PAGE_SHIFT) +
24833 +       //      (vma->vm_pgoff - privcmd_range->pgoff);
24834 +       int error;
24835 +
24836 +       error = HYPERVISOR_zap_physmap(gpfn, 0);
24837 +       if (error) {
24838 +               BUG();//XXX
24839 +       }
24840 +       entry->gpfn = INVALID_GPFN;
24841 +}
24842 +
24843 +static void
24844 +xen_ia64_privcmd_entry_open(struct xen_ia64_privcmd_range* privcmd_range,
24845 +                           int i)
24846 +{
24847 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24848 +       if (entry->gpfn != INVALID_GPFN) {
24849 +               atomic_inc(&entry->map_count);
24850 +       } else {
24851 +               BUG_ON(atomic_read(&entry->map_count) != 0);
24852 +       }
24853 +}
24854 +
24855 +static void
24856 +xen_ia64_privcmd_entry_close(struct xen_ia64_privcmd_range* privcmd_range,
24857 +                            int i)
24858 +{
24859 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24860 +       if (entry->gpfn != INVALID_GPFN &&
24861 +           atomic_dec_and_test(&entry->map_count)) {
24862 +               xen_ia64_privcmd_entry_munmap(privcmd_range, i);
24863 +       }
24864 +}
24865 +
24866 +static void xen_ia64_privcmd_vma_open(struct vm_area_struct* vma);
24867 +static void xen_ia64_privcmd_vma_close(struct vm_area_struct* vma);
24868 +
24869 +struct vm_operations_struct xen_ia64_privcmd_vm_ops = {
24870 +       .open = &xen_ia64_privcmd_vma_open,
24871 +       .close = &xen_ia64_privcmd_vma_close,
24872 +};
24873 +
24874 +static void
24875 +__xen_ia64_privcmd_vma_open(struct vm_area_struct* vma,
24876 +                           struct xen_ia64_privcmd_vma* privcmd_vma,
24877 +                           struct xen_ia64_privcmd_range* privcmd_range)
24878 +{
24879 +       unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24880 +       unsigned long num_entries = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
24881 +       unsigned long i;
24882 +
24883 +       BUG_ON(entry_offset < 0);
24884 +       BUG_ON(entry_offset + num_entries > privcmd_range->num_entries);
24885 +
24886 +       privcmd_vma->range = privcmd_range;
24887 +       privcmd_vma->num_entries = num_entries;
24888 +       privcmd_vma->entries = &privcmd_range->entries[entry_offset];
24889 +       vma->vm_private_data = privcmd_vma;
24890 +       for (i = 0; i < privcmd_vma->num_entries; i++) {
24891 +               xen_ia64_privcmd_entry_open(privcmd_range, entry_offset + i);
24892 +       }
24893 +
24894 +       vma->vm_private_data = privcmd_vma;
24895 +       vma->vm_ops = &xen_ia64_privcmd_vm_ops;
24896 +}
24897 +
24898 +static void
24899 +xen_ia64_privcmd_vma_open(struct vm_area_struct* vma)
24900 +{
24901 +       struct xen_ia64_privcmd_vma* old_privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24902 +       struct xen_ia64_privcmd_vma* privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24903 +       struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24904 +
24905 +       atomic_inc(&privcmd_range->ref_count);
24906 +       // vm_op->open() can't fail.
24907 +       privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL | __GFP_NOFAIL);
24908 +       // copy original value if necessary
24909 +       privcmd_vma->is_privcmd_mmapped = old_privcmd_vma->is_privcmd_mmapped;
24910 +
24911 +       __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
24912 +}
24913 +
24914 +static void
24915 +xen_ia64_privcmd_vma_close(struct vm_area_struct* vma)
24916 +{
24917 +       struct xen_ia64_privcmd_vma* privcmd_vma =
24918 +               (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24919 +       struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24920 +       unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24921 +       unsigned long i;
24922 +
24923 +       for (i = 0; i < privcmd_vma->num_entries; i++) {
24924 +               xen_ia64_privcmd_entry_close(privcmd_range, entry_offset + i);
24925 +       }
24926 +       vma->vm_private_data = NULL;
24927 +       kfree(privcmd_vma);
24928 +
24929 +       if (atomic_dec_and_test(&privcmd_range->ref_count)) {
24930 +#if 1
24931 +               for (i = 0; i < privcmd_range->num_entries; i++) {
24932 +                       struct xen_ia64_privcmd_entry* entry =
24933 +                               &privcmd_range->entries[i];
24934 +                       BUG_ON(atomic_read(&entry->map_count) != 0);
24935 +                       BUG_ON(entry->gpfn != INVALID_GPFN);
24936 +               }
24937 +#endif
24938 +               release_resource(privcmd_range->res);
24939 +               kfree(privcmd_range->res);
24940 +               vfree(privcmd_range);
24941 +       }
24942 +}
24943 +
24944 +int
24945 +privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
24946 +{
24947 +       struct xen_ia64_privcmd_vma* privcmd_vma =
24948 +               (struct xen_ia64_privcmd_vma *)vma->vm_private_data;
24949 +       return (xchg(&privcmd_vma->is_privcmd_mmapped, 1) == 0);
24950 +}
24951 +
24952 +int
24953 +privcmd_mmap(struct file * file, struct vm_area_struct * vma)
24954 +{
24955 +       int error;
24956 +       unsigned long size = vma->vm_end - vma->vm_start;
24957 +       unsigned long num_entries = size >> PAGE_SHIFT;
24958 +       struct xen_ia64_privcmd_range* privcmd_range = NULL;
24959 +       struct xen_ia64_privcmd_vma* privcmd_vma = NULL;
24960 +       struct resource* res = NULL;
24961 +       unsigned long i;
24962 +       BUG_ON(!is_running_on_xen());
24963 +
24964 +       BUG_ON(file->private_data != NULL);
24965 +
24966 +       error = -ENOMEM;
24967 +       privcmd_range =
24968 +               vmalloc(sizeof(*privcmd_range) +
24969 +                       sizeof(privcmd_range->entries[0]) * num_entries);
24970 +       if (privcmd_range == NULL) {
24971 +               goto out_enomem0;
24972 +       }
24973 +       privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL);
24974 +       if (privcmd_vma == NULL) {
24975 +               goto out_enomem1;
24976 +       }
24977 +       privcmd_vma->is_privcmd_mmapped = 0;
24978 +
24979 +       res = kzalloc(sizeof(*res), GFP_KERNEL);
24980 +       if (res == NULL) {
24981 +               goto out_enomem1;
24982 +       }
24983 +       res->name = "Xen privcmd mmap";
24984 +       error = allocate_resource(&iomem_resource, res, size,
24985 +                                 privcmd_resource_min, privcmd_resource_max,
24986 +                                 privcmd_resource_align, NULL, NULL);
24987 +       if (error) {
24988 +               goto out_enomem1;
24989 +       }
24990 +       privcmd_range->res = res;
24991 +
24992 +       /* DONTCOPY is essential for Xen as copy_page_range is broken. */
24993 +       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
24994 +
24995 +       atomic_set(&privcmd_range->ref_count, 1);
24996 +       privcmd_range->pgoff = vma->vm_pgoff;
24997 +       privcmd_range->num_entries = num_entries;
24998 +       for (i = 0; i < privcmd_range->num_entries; i++) {
24999 +               xen_ia64_privcmd_init_entry(&privcmd_range->entries[i]);
25000 +       }
25001 +
25002 +       __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
25003 +       return 0;
25004 +
25005 +out_enomem1:
25006 +       kfree(res);
25007 +       kfree(privcmd_vma);
25008 +out_enomem0:
25009 +       vfree(privcmd_range);
25010 +       return error;
25011 +}
25012 +
25013 +int
25014 +direct_remap_pfn_range(struct vm_area_struct *vma,
25015 +                      unsigned long address,   // process virtual address
25016 +                      unsigned long gmfn,      // gmfn, gmfn + 1, ... gmfn + size/PAGE_SIZE
25017 +                      unsigned long size,
25018 +                      pgprot_t prot,
25019 +                      domid_t  domid)          // target domain
25020 +{
25021 +       struct xen_ia64_privcmd_vma* privcmd_vma =
25022 +               (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
25023 +       struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
25024 +       unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
25025 +
25026 +       unsigned long i;
25027 +       unsigned long offset;
25028 +       int error = 0;
25029 +       BUG_ON(!is_running_on_xen());
25030 +
25031 +#if 0
25032 +       if (prot != vm->vm_page_prot) {
25033 +               return -EINVAL;
25034 +       }
25035 +#endif
25036 +
25037 +       i = (address - vma->vm_start) >> PAGE_SHIFT;
25038 +       for (offset = 0; offset < size; offset += PAGE_SIZE) {
25039 +               error = xen_ia64_privcmd_entry_mmap(vma, (address + offset) & PAGE_MASK, privcmd_range, entry_offset + i, gmfn, prot, domid);
25040 +               if (error != 0) {
25041 +                       break;
25042 +               }
25043 +
25044 +               i++;
25045 +               gmfn++;
25046 +        }
25047 +
25048 +       return error;
25049 +}
25050 +
25051 +
25052 +/* Called after suspend, to resume time.  */
25053 +void
25054 +time_resume(void)
25055 +{
25056 +       extern void ia64_cpu_local_tick(void);
25057 +
25058 +       /* Just trigger a tick.  */
25059 +       ia64_cpu_local_tick();
25060 +}
25061 +
25062 +///////////////////////////////////////////////////////////////////////////
25063 +// expose p2m table
25064 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
25065 +#include <linux/cpu.h>
25066 +#include <asm/uaccess.h>
25067 +
25068 +int p2m_initialized __read_mostly = 0;
25069 +
25070 +unsigned long p2m_min_low_pfn __read_mostly;
25071 +unsigned long p2m_max_low_pfn __read_mostly;
25072 +unsigned long p2m_convert_min_pfn __read_mostly;
25073 +unsigned long p2m_convert_max_pfn __read_mostly;
25074 +
25075 +static struct resource p2m_resource = {
25076 +       .name    = "Xen p2m table",
25077 +       .flags   = IORESOURCE_MEM,
25078 +};
25079 +static unsigned long p2m_assign_start_pfn __read_mostly;
25080 +static unsigned long p2m_assign_end_pfn __read_mostly;
25081 +volatile const pte_t* p2m_pte __read_mostly;
25082 +
25083 +#define GRNULE_PFN     PTRS_PER_PTE
25084 +static unsigned long p2m_granule_pfn __read_mostly = GRNULE_PFN;
25085 +
25086 +#define ROUNDDOWN(x, y)  ((x) & ~((y) - 1))
25087 +#define ROUNDUP(x, y)    (((x) + (y) - 1) & ~((y) - 1))
25088 +
25089 +#define P2M_PREFIX     "Xen p2m: "
25090 +
25091 +static int xen_ia64_p2m_expose __read_mostly = 1;
25092 +module_param(xen_ia64_p2m_expose, int, 0);
25093 +MODULE_PARM_DESC(xen_ia64_p2m_expose,
25094 +                 "enable/disable xen/ia64 p2m exposure optimization\n");
25095 +
25096 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
25097 +static int xen_ia64_p2m_expose_use_dtr __read_mostly = 1;
25098 +module_param(xen_ia64_p2m_expose_use_dtr, int, 0);
25099 +MODULE_PARM_DESC(xen_ia64_p2m_expose_use_dtr,
25100 +                 "use/unuse dtr to map exposed p2m table\n");
25101 +
25102 +static const int p2m_page_shifts[] = {
25103 +       _PAGE_SIZE_4K,
25104 +       _PAGE_SIZE_8K,
25105 +       _PAGE_SIZE_16K,
25106 +       _PAGE_SIZE_64K,
25107 +       _PAGE_SIZE_256K,
25108 +       _PAGE_SIZE_1M,
25109 +       _PAGE_SIZE_4M,
25110 +       _PAGE_SIZE_16M,
25111 +       _PAGE_SIZE_64M,
25112 +       _PAGE_SIZE_256M,
25113 +};
25114 +
25115 +struct p2m_itr_arg {
25116 +       unsigned long vaddr;
25117 +       unsigned long pteval;
25118 +       unsigned long log_page_size;
25119 +};
25120 +static struct p2m_itr_arg p2m_itr_arg __read_mostly;
25121 +
25122 +// This should be in asm-ia64/kregs.h
25123 +#define IA64_TR_P2M_TABLE      3
25124 +
25125 +static void
25126 +p2m_itr(void* info)
25127 +{
25128 +       struct p2m_itr_arg* arg = (struct p2m_itr_arg*)info;
25129 +       ia64_itr(0x2, IA64_TR_P2M_TABLE,
25130 +                arg->vaddr, arg->pteval, arg->log_page_size);
25131 +       ia64_srlz_d();
25132 +}
25133 +
25134 +static int
25135 +p2m_expose_dtr_call(struct notifier_block *self,
25136 +                    unsigned long event, void* ptr)
25137 +{
25138 +       unsigned int cpu = (unsigned int)(long)ptr;
25139 +       if (event != CPU_ONLINE)
25140 +               return 0;
25141 +       if (!(p2m_initialized && xen_ia64_p2m_expose_use_dtr))
25142 +               smp_call_function_single(cpu, &p2m_itr, &p2m_itr_arg, 1, 1);
25143 +       return 0;
25144 +}
25145 +
25146 +static struct notifier_block p2m_expose_dtr_hotplug_notifier = {
25147 +       .notifier_call = p2m_expose_dtr_call,
25148 +       .next          = NULL,
25149 +       .priority      = 0
25150 +};
25151 +#endif
25152 +
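+/*
+ * Expose the p2m table to this guest: reserve a pseudo-physical region out
+ * of the privcmd range, ask the hypervisor to map the p2m table there
+ * (HYPERVISOR_expose_p2m) and, when EXPOSE_P2M_USE_DTR is configured, pin
+ * the exposed table on every CPU with a translation register.
+ */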
25153 +static int
25154 +p2m_expose_init(void)
25155 +{
25156 +       unsigned long num_pfn;
25157 +       unsigned long size = 0;
25158 +       unsigned long p2m_size = 0;
25159 +       unsigned long align = ~0UL;
25160 +       int error = 0;
25161 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
25162 +       int i;
25163 +       unsigned long page_size;
25164 +       unsigned long log_page_size = 0;
25165 +#endif
25166 +
25167 +       if (!xen_ia64_p2m_expose)
25168 +               return -ENOSYS;
25169 +       if (p2m_initialized)
25170 +               return 0;
25171 +
25172 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
25173 +       error = register_cpu_notifier(&p2m_expose_dtr_hotplug_notifier);
25174 +       if (error < 0)
25175 +               return error;
25176 +#endif
25177 +
25178 +       lock_cpu_hotplug();
25179 +       if (p2m_initialized)
25180 +               goto out;
25181 +
25182 +#ifdef CONFIG_DISCONTIGMEM
25183 +       p2m_min_low_pfn = min_low_pfn;
25184 +       p2m_max_low_pfn = max_low_pfn;
25185 +#else
25186 +       p2m_min_low_pfn = 0;
25187 +       p2m_max_low_pfn = max_pfn;
25188 +#endif
25189 +
25190 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
25191 +       if (xen_ia64_p2m_expose_use_dtr) {
25192 +               unsigned long granule_pfn = 0;
25193 +               p2m_size = p2m_max_low_pfn - p2m_min_low_pfn;
25194 +               for (i = 0;
25195 +                    i < sizeof(p2m_page_shifts)/sizeof(p2m_page_shifts[0]);
25196 +                    i++) {
25197 +                       log_page_size = p2m_page_shifts[i];
25198 +                       page_size = 1UL << log_page_size;
25199 +                       if (page_size < p2m_size)
25200 +                               continue;
25201 +
25202 +                       granule_pfn = max(page_size >> PAGE_SHIFT,
25203 +                                         p2m_granule_pfn);
25204 +                       p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn,
25205 +                                                       granule_pfn);
25206 +                       p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn,
25207 +                                                     granule_pfn);
25208 +                       num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn;
25209 +                       size = num_pfn << PAGE_SHIFT;
25210 +                       p2m_size = num_pfn / PTRS_PER_PTE;
25211 +                       p2m_size = ROUNDUP(p2m_size, granule_pfn << PAGE_SHIFT);
25212 +                       if (p2m_size == page_size)
25213 +                               break;
25214 +               }
25215 +               if (p2m_size != page_size) {
25216 +                       printk(KERN_ERR "p2m_size != page_size\n");
25217 +                       error = -EINVAL;
25218 +                       goto out;
25219 +               }
25220 +               align = max(privcmd_resource_align, granule_pfn << PAGE_SHIFT);
25221 +       } else
25222 +#endif
25223 +       {
25224 +               BUG_ON(p2m_granule_pfn & (p2m_granule_pfn - 1));
25225 +               p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn,
25226 +                                               p2m_granule_pfn);
25227 +               p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn, p2m_granule_pfn);
25228 +               num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn;
25229 +               size = num_pfn << PAGE_SHIFT;
25230 +               p2m_size = num_pfn / PTRS_PER_PTE;
25231 +               p2m_size = ROUNDUP(p2m_size, p2m_granule_pfn << PAGE_SHIFT);
25232 +               align = max(privcmd_resource_align,
25233 +                           p2m_granule_pfn << PAGE_SHIFT);
25234 +       }
25235 +       
25236 +       // use privcmd region
25237 +       error = allocate_resource(&iomem_resource, &p2m_resource, p2m_size,
25238 +                                 privcmd_resource_min, privcmd_resource_max,
25239 +                                 align, NULL, NULL);
25240 +       if (error) {
25241 +               printk(KERN_ERR P2M_PREFIX
25242 +                      "can't allocate region for p2m exposure "
25243 +                      "[0x%016lx, 0x%016lx) 0x%016lx\n",
25244 +                      p2m_convert_min_pfn, p2m_convert_max_pfn, p2m_size);
25245 +               goto out;
25246 +       }
25247 +
25248 +       p2m_assign_start_pfn = p2m_resource.start >> PAGE_SHIFT;
25249 +       p2m_assign_end_pfn = p2m_resource.end >> PAGE_SHIFT;
25250 +       
25251 +       error = HYPERVISOR_expose_p2m(p2m_convert_min_pfn,
25252 +                                     p2m_assign_start_pfn,
25253 +                                     size, p2m_granule_pfn);
25254 +       if (error) {
25255 +               printk(KERN_ERR P2M_PREFIX "expose p2m hypercall failed %d\n",
25256 +                      error);
25257 +               printk(KERN_ERR P2M_PREFIX "conv 0x%016lx assign 0x%016lx "
25258 +                      "size 0x%016lx granule 0x%016lx\n",
25259 +                      p2m_convert_min_pfn, p2m_assign_start_pfn,
25260 +                      size, p2m_granule_pfn);
25261 +               release_resource(&p2m_resource);
25262 +               goto out;
25263 +       }
25264 +       p2m_pte = (volatile const pte_t*)pfn_to_kaddr(p2m_assign_start_pfn);
25265 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
25266 +       if (xen_ia64_p2m_expose_use_dtr) {
25267 +               p2m_itr_arg.vaddr = (unsigned long)__va(p2m_assign_start_pfn
25268 +                                                       << PAGE_SHIFT);
25269 +               p2m_itr_arg.pteval = pte_val(pfn_pte(p2m_assign_start_pfn,
25270 +                                                    PAGE_KERNEL));
25271 +               p2m_itr_arg.log_page_size = log_page_size;
25272 +               smp_mb();
25273 +               smp_call_function(&p2m_itr, &p2m_itr_arg, 1, 1);
25274 +               p2m_itr(&p2m_itr_arg);
25275 +       }
25276 +#endif 
25277 +       smp_mb();
25278 +       p2m_initialized = 1;
25279 +       printk(P2M_PREFIX "assign p2m table of [0x%016lx, 0x%016lx)\n",
25280 +              p2m_convert_min_pfn << PAGE_SHIFT,
25281 +              p2m_convert_max_pfn << PAGE_SHIFT);
25282 +       printk(P2M_PREFIX "to [0x%016lx, 0x%016lx) (%ld KBytes)\n",
25283 +              p2m_assign_start_pfn << PAGE_SHIFT,
25284 +              p2m_assign_end_pfn << PAGE_SHIFT,
25285 +              p2m_size / 1024);
25286 +out:
25287 +       unlock_cpu_hotplug();
25288 +       return error;
25289 +}
25290 +
25291 +#ifdef notyet
25292 +void
25293 +p2m_expose_cleanup(void)
25294 +{
25295 +       BUG_ON(!p2m_initialized);
25296 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
25297 +       unregister_cpu_notifier(&p2m_expose_dtr_hotplug_notifier);
25298 +#endif
25299 +       release_resource(&p2m_resource);
25300 +}
25301 +#endif
25302 +
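+/*
+ * Translate a guest pfn to a machine frame by reading the exposed p2m table
+ * directly.  __get_user() is used so that a faulting access to the table is
+ * handled safely and INVALID_MFN is returned instead of oopsing.
+ */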
25303 +//XXX inline this?
25304 +unsigned long
25305 +p2m_phystomach(unsigned long gpfn)
25306 +{
25307 +       volatile const pte_t* pte;
25308 +       unsigned long mfn;
25309 +       unsigned long pteval;
25310 +       
25311 +       if (!p2m_initialized ||
25312 +           gpfn < p2m_min_low_pfn || gpfn > p2m_max_low_pfn
25313 +           /* || !pfn_valid(gpfn) */)
25314 +               return INVALID_MFN;
25315 +       pte = p2m_pte + (gpfn - p2m_convert_min_pfn);
25316 +
25317 +       mfn = INVALID_MFN;
25318 +       if (likely(__get_user(pteval, (unsigned long __user *)pte) == 0 &&
25319 +                  pte_present(__pte(pteval)) &&
25320 +                  pte_pfn(__pte(pteval)) != (INVALID_MFN >> PAGE_SHIFT)))
25321 +               mfn = (pteval & _PFN_MASK) >> PAGE_SHIFT;
25322 +
25323 +       return mfn;
25324 +}
25325 +
25326 +EXPORT_SYMBOL_GPL(p2m_initialized);
25327 +EXPORT_SYMBOL_GPL(p2m_min_low_pfn);
25328 +EXPORT_SYMBOL_GPL(p2m_max_low_pfn);
25329 +EXPORT_SYMBOL_GPL(p2m_convert_min_pfn);
25330 +EXPORT_SYMBOL_GPL(p2m_convert_max_pfn);
25331 +EXPORT_SYMBOL_GPL(p2m_pte);
25332 +EXPORT_SYMBOL_GPL(p2m_phystomach);
25333 +#endif
25334 +
25335 +///////////////////////////////////////////////////////////////////////////
25336 +// for xenoprof
25337 +
25338 +struct resource*
25339 +xen_ia64_allocate_resource(unsigned long size)
25340 +{
25341 +       struct resource* res;
25342 +       int error;
25343 +       
25344 +       res = kmalloc(sizeof(*res), GFP_KERNEL);
25345 +       if (res == NULL)
25346 +               return ERR_PTR(-ENOMEM);
25347 +
25348 +       res->name = "Xen";
25349 +       res->flags = IORESOURCE_MEM;
25350 +       error = allocate_resource(&iomem_resource, res, PAGE_ALIGN(size),
25351 +                                 privcmd_resource_min, privcmd_resource_max,
25352 +                                 IA64_GRANULE_SIZE, NULL, NULL);
25353 +       if (error) {
25354 +               kfree(res);
25355 +               return ERR_PTR(error);
25356 +       }
25357 +       return res;
25358 +}
25359 +EXPORT_SYMBOL_GPL(xen_ia64_allocate_resource);
25360 +
25361 +void
25362 +xen_ia64_release_resource(struct resource* res)
25363 +{
25364 +       release_resource(res);
25365 +       kfree(res);
25366 +}
25367 +EXPORT_SYMBOL_GPL(xen_ia64_release_resource);
25368 +
25369 +void
25370 +xen_ia64_unmap_resource(struct resource* res)
25371 +{
25372 +       unsigned long gpfn = res->start >> PAGE_SHIFT;
25373 +       unsigned long nr_pages = (res->end - res->start) >> PAGE_SHIFT;
25374 +       unsigned long i;
25375 +       
25376 +       for (i = 0; i < nr_pages; i++) {
25377 +               int error = HYPERVISOR_zap_physmap(gpfn + i, 0);
25378 +               if (error)
25379 +                       printk(KERN_ERR
25380 +                              "%s:%d zap_physmap failed %d gpfn %lx\n",
25381 +                              __func__, __LINE__, error, gpfn + i);
25382 +       }
25383 +       xen_ia64_release_resource(res);
25384 +}
25385 +EXPORT_SYMBOL_GPL(xen_ia64_unmap_resource);
25386 diff -ruNp linux-2.6.19/arch/ia64/xen/mem.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/mem.c
25387 --- linux-2.6.19/arch/ia64/xen/mem.c    1970-01-01 00:00:00.000000000 +0000
25388 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/mem.c  2007-02-02 19:10:21.000000000 +0000
25389 @@ -0,0 +1,76 @@
25390 +/*
25391 + *  Originally from linux/drivers/char/mem.c
25392 + *
25393 + *  Copyright (C) 1991, 1992  Linus Torvalds
25394 + *
25395 + *  Added devfs support. 
25396 + *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
25397 + *  Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
25398 + */
25399 +/*
25400 + * taken from
25401 + * linux/drivers/char/mem.c and linux-2.6-xen-sparse/drivers/xen/char/mem.c.
25402 + * adjusted for IA64 and made transparent.
25403 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
25404 + *                    VA Linux Systems Japan K.K.
25405 + */
25406 +
25407 +#include <linux/config.h>
25408 +#include <linux/mm.h>
25409 +#include <linux/efi.h>
25410 +
25411 +/*
25412 + * Architectures vary in how they handle caching for addresses
25413 + * outside of main memory.
25414 + *
25415 + */
25416 +static inline int uncached_access(struct file *file, unsigned long addr)
25417 +{
25418 +       /*
25419 +        * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases.
25420 +        */
25421 +       return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
25422 +}
25423 +
25424 +int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
25425 +{
25426 +       unsigned long addr = vma->vm_pgoff << PAGE_SHIFT;
25427 +       size_t size = vma->vm_end - vma->vm_start;
25428 +
25429 +
25430 +#if 0
25431 +       /*
25432 +        *XXX FIXME: linux-2.6.16.29, linux-2.6.17
25433 +        *    valid_mmap_phys_addr_range() in linux/arch/ia64/kernel/efi.c
25434 +        *    fails checks.
25435 +        *    linux-2.6.18.1's always returns 1.
25436 +        *    Its comment says
25437 +        *
25438 +         * MMIO regions are often missing from the EFI memory map.
25439 +         * We must allow mmap of them for programs like X, so we
25440 +         * currently can't do any useful validation.
25441 +         */
25442 +       if (!valid_mmap_phys_addr_range(addr, &size))
25443 +               return -EINVAL;
25444 +       if (size < vma->vm_end - vma->vm_start)
25445 +               return -EINVAL;
25446 +#endif
25447 +
25448 +       if (is_running_on_xen()) {
25449 +               unsigned long offset = HYPERVISOR_ioremap(addr, size);
25450 +               if (IS_ERR_VALUE(offset))
25451 +                       return offset;
25452 +       }
25453 +
25454 +       if (uncached_access(file, vma->vm_pgoff << PAGE_SHIFT))
25455 +               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
25456 +
25457 +        /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
25458 +        if (remap_pfn_range(vma,
25459 +                            vma->vm_start,
25460 +                            vma->vm_pgoff,
25461 +                            size,
25462 +                            vma->vm_page_prot))
25463 +                return -EAGAIN;
25464 +        return 0;
25465 +}
25466 diff -ruNp linux-2.6.19/arch/ia64/xen/util.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/util.c
25467 --- linux-2.6.19/arch/ia64/xen/util.c   1970-01-01 00:00:00.000000000 +0000
25468 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/util.c 2007-02-02 19:10:21.000000000 +0000
25469 @@ -0,0 +1,117 @@
25470 +/******************************************************************************
25471 + * arch/ia64/xen/util.c
25472 + * This file is the ia64 counterpart of drivers/xen/util.c
25473 + *
25474 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
25475 + *                    VA Linux Systems Japan K.K.
25476 + *
25477 + * This program is free software; you can redistribute it and/or modify
25478 + * it under the terms of the GNU General Public License as published by
25479 + * the Free Software Foundation; either version 2 of the License, or
25480 + * (at your option) any later version.
25481 + *
25482 + * This program is distributed in the hope that it will be useful,
25483 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
25484 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25485 + * GNU General Public License for more details.
25486 + *
25487 + * You should have received a copy of the GNU General Public License
25488 + * along with this program; if not, write to the Free Software
25489 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25490 + *
25491 + */
25492 +
25493 +#include <linux/mm.h>
25494 +#include <linux/module.h>
25495 +#include <linux/slab.h>
25496 +#include <linux/vmalloc.h>
25497 +#include <asm/uaccess.h>
25498 +#include <xen/driver_util.h>
25499 +#include <xen/interface/memory.h>
25500 +#include <asm/hypercall.h>
25501 +
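+/*
+ * On ia64, alloc_vm_area() hands out physically contiguous pages from
+ * __get_free_pages() (scrubbed first), wrapped in a kmalloc'ed vm_struct,
+ * rather than carving the area out of vmalloc space.  The area is later
+ * used for foreign page mappings, which is why free_vm_area() below has to
+ * repopulate the physmap before freeing the pages.
+ */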
25502 +struct vm_struct *alloc_vm_area(unsigned long size)
25503 +{
25504 +       int order;
25505 +       unsigned long virt;
25506 +       unsigned long nr_pages;
25507 +       struct vm_struct* area;
25508 +       
25509 +       order = get_order(size);
25510 +       virt = __get_free_pages(GFP_KERNEL, order);
25511 +       if (virt == 0) {
25512 +               goto err0;
25513 +       }
25514 +       nr_pages = 1 << order;
25515 +       scrub_pages(virt, nr_pages);
25516 +       
25517 +       area = kmalloc(sizeof(*area), GFP_KERNEL);
25518 +       if (area == NULL) {
25519 +               goto err1;
25520 +       }
25521 +       
25522 +        area->flags = VM_IOREMAP;//XXX
25523 +        area->addr = (void*)virt;
25524 +        area->size = size;
25525 +        area->pages = NULL; //XXX
25526 +        area->nr_pages = nr_pages;
25527 +        area->phys_addr = 0;   /* xenbus_map_ring_valloc uses this field!  */
25528 +
25529 +       return area;
25530 +
25531 +err1:
25532 +       free_pages(virt, order);
25533 +err0:
25534 +       return NULL;
25535 +       
25536 +}
25537 +EXPORT_SYMBOL_GPL(alloc_vm_area);
25538 +
25539 +void free_vm_area(struct vm_struct *area)
25540 +{
25541 +       unsigned int order = get_order(area->size);
25542 +       unsigned long i;
25543 +       unsigned long phys_addr = __pa(area->addr);
25544 +
25545 +       // This area is used for foreign page mapping,
25546 +       // so the underlying machine pages may not be assigned.
25547 +       for (i = 0; i < (1 << order); i++) {
25548 +               unsigned long ret;
25549 +               unsigned long gpfn = (phys_addr >> PAGE_SHIFT) + i;
25550 +               struct xen_memory_reservation reservation = {
25551 +                       .nr_extents   = 1,
25552 +                       .address_bits = 0,
25553 +                       .extent_order = 0,
25554 +                       .domid        = DOMID_SELF
25555 +               };
25556 +               set_xen_guest_handle(reservation.extent_start, &gpfn);
25557 +               ret = HYPERVISOR_memory_op(XENMEM_populate_physmap,
25558 +                                          &reservation);
25559 +               BUG_ON(ret != 1);
25560 +       }
25561 +       free_pages((unsigned long)area->addr, order);
25562 +       kfree(area);
25563 +}
25564 +EXPORT_SYMBOL_GPL(free_vm_area);
25565 +
25566 +void lock_vm_area(struct vm_struct *area)
25567 +{
25568 +       // nothing
25569 +}
25570 +EXPORT_SYMBOL_GPL(lock_vm_area);
25571 +
25572 +void unlock_vm_area(struct vm_struct *area)
25573 +{
25574 +       // nothing
25575 +}
25576 +EXPORT_SYMBOL_GPL(unlock_vm_area);
25577 +
25578 +/*
25579 + * Local variables:
25580 + *  c-file-style: "linux"
25581 + *  indent-tabs-mode: t
25582 + *  c-indent-level: 8
25583 + *  c-basic-offset: 8
25584 + *  tab-width: 8
25585 + * End:
25586 + */
25587 diff -ruNp linux-2.6.19/arch/ia64/xen/xcom_hcall.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/xcom_hcall.c
25588 --- linux-2.6.19/arch/ia64/xen/xcom_hcall.c     1970-01-01 00:00:00.000000000 +0000
25589 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xcom_hcall.c   2007-02-02 19:10:21.000000000 +0000
25590 @@ -0,0 +1,365 @@
25591 +/*
25592 + * This program is free software; you can redistribute it and/or modify
25593 + * it under the terms of the GNU General Public License as published by
25594 + * the Free Software Foundation; either version 2 of the License, or
25595 + * (at your option) any later version.
25596 + *
25597 + * This program is distributed in the hope that it will be useful,
25598 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
25599 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25600 + * GNU General Public License for more details.
25601 + *
25602 + * You should have received a copy of the GNU General Public License
25603 + * along with this program; if not, write to the Free Software
25604 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
25605 + *
25606 + *          Tristan Gingold <tristan.gingold@bull.net>
25607 + */
25608 +#include <linux/types.h>
25609 +#include <linux/errno.h>
25610 +#include <linux/kernel.h>
25611 +#include <linux/gfp.h>
25612 +#include <linux/module.h>
25613 +#include <xen/interface/xen.h>
25614 +#include <xen/interface/dom0_ops.h>
25615 +#include <xen/interface/memory.h>
25616 +#include <xen/interface/xencomm.h>
25617 +#include <xen/interface/version.h>
25618 +#include <xen/interface/sched.h>
25619 +#include <xen/interface/event_channel.h>
25620 +#include <xen/interface/physdev.h>
25621 +#include <xen/interface/grant_table.h>
25622 +#include <xen/interface/callback.h>
25623 +#include <xen/interface/acm_ops.h>
25624 +#include <xen/interface/hvm/params.h>
25625 +#include <xen/interface/xenoprof.h>
25626 +#include <asm/hypercall.h>
25627 +#include <asm/page.h>
25628 +#include <asm/uaccess.h>
25629 +#include <asm/xen/xencomm.h>
25630 +#include <asm/perfmon.h>
25631 +
25632 +/* Xencomm notes:
25633 + * This file defines hypercalls to be used by xencomm.  The hypercalls simply
25634 + * create inline descriptors for pointers and then call the raw arch hypercall
25635 + * xencomm_arch_hypercall_XXX
25636 + *
25637 + * If the arch wants to directly use these hypercalls, simply define macros
25638 + * in asm/hypercall.h, eg:
25639 + *  #define HYPERVISOR_sched_op xencomm_hypercall_sched_op
25640 + * 
25641 + * The arch may also define HYPERVISOR_xxx as a function and do more operations
25642 + * before/after doing the hypercall.
25643 + *
25644 + * Note: because only inline descriptors are created, these functions must
25645 + * only be called with parameters that reside in kernel memory.
25646 + */
25647 +
25648 +int
25649 +xencomm_hypercall_console_io(int cmd, int count, char *str)
25650 +{
25651 +       return xencomm_arch_hypercall_console_io
25652 +               (cmd, count, xencomm_create_inline(str));
25653 +}
25654 +
25655 +int
25656 +xencomm_hypercall_event_channel_op(int cmd, void *op)
25657 +{
25658 +       return xencomm_arch_hypercall_event_channel_op
25659 +               (cmd, xencomm_create_inline(op));
25660 +}
25661 +
25662 +int
25663 +xencomm_hypercall_xen_version(int cmd, void *arg)
25664 +{
25665 +       switch (cmd) {
25666 +       case XENVER_version:
25667 +       case XENVER_extraversion:
25668 +       case XENVER_compile_info:
25669 +       case XENVER_capabilities:
25670 +       case XENVER_changeset:
25671 +       case XENVER_platform_parameters:
25672 +       case XENVER_pagesize:
25673 +       case XENVER_get_features:
25674 +               break;
25675 +       default:
25676 +               printk("%s: unknown version cmd %d\n", __func__, cmd);
25677 +               return -ENOSYS;
25678 +       }
25679 +
25680 +       return xencomm_arch_hypercall_xen_version
25681 +               (cmd, xencomm_create_inline(arg));
25682 +}
25683 +
25684 +int
25685 +xencomm_hypercall_physdev_op(int cmd, void *op)
25686 +{
25687 +       return xencomm_arch_hypercall_physdev_op
25688 +               (cmd, xencomm_create_inline(op));
25689 +}
25690 +
25691 +static void *
25692 +xencommize_grant_table_op(unsigned int cmd, void *op, unsigned int count)
25693 +{
25694 +       switch (cmd) {
25695 +       case GNTTABOP_map_grant_ref:
25696 +       case GNTTABOP_unmap_grant_ref:
25697 +               break;
25698 +       case GNTTABOP_setup_table:
25699 +       {
25700 +               struct gnttab_setup_table *setup = op;
25701 +               struct xencomm_handle *frame_list;
25702 +
25703 +               frame_list = xencomm_create_inline
25704 +                       (xen_guest_handle(setup->frame_list));
25705 +
25706 +               set_xen_guest_handle(setup->frame_list, (void *)frame_list);
25707 +               break;
25708 +       }
25709 +       case GNTTABOP_dump_table:
25710 +       case GNTTABOP_transfer:
25711 +       case GNTTABOP_copy:
25712 +               break;
25713 +       default:
25714 +               printk("%s: unknown grant table op %d\n", __func__, cmd);
25715 +               BUG();
25716 +       }
25717 +
25718 +       return  xencomm_create_inline(op);
25719 +}
25720 +
25721 +int
25722 +xencomm_hypercall_grant_table_op(unsigned int cmd, void *op, unsigned int count)
25723 +{
25724 +       void *desc = xencommize_grant_table_op (cmd, op, count);
25725 +
25726 +       return xencomm_arch_hypercall_grant_table_op(cmd, desc, count);
25727 +}
25728 +
25729 +int
25730 +xencomm_hypercall_sched_op(int cmd, void *arg)
25731 +{
25732 +       switch (cmd) {
25733 +       case SCHEDOP_yield:
25734 +       case SCHEDOP_block:
25735 +       case SCHEDOP_shutdown:
25736 +       case SCHEDOP_remote_shutdown:
25737 +               break;
25738 +       case SCHEDOP_poll:
25739 +       {
25740 +               sched_poll_t *poll = arg;
25741 +               struct xencomm_handle *ports;
25742 +
25743 +               ports = xencomm_create_inline(xen_guest_handle(poll->ports));
25744 +
25745 +               set_xen_guest_handle(poll->ports, (void *)ports);
25746 +               break;
25747 +       }
25748 +       default:
25749 +               printk("%s: unknown sched op %d\n", __func__, cmd);
25750 +               return -ENOSYS;
25751 +       }
25752 +       
25753 +       return xencomm_arch_hypercall_sched_op(cmd, xencomm_create_inline(arg));
25754 +}
25755 +
25756 +int
25757 +xencomm_hypercall_multicall(void *call_list, int nr_calls)
25758 +{
25759 +       int i;
25760 +       multicall_entry_t *mce;
25761 +
25762 +       for (i = 0; i < nr_calls; i++) {
25763 +               mce = (multicall_entry_t *)call_list + i;
25764 +
25765 +               switch (mce->op) {
25766 +               case __HYPERVISOR_update_va_mapping:
25767 +               case __HYPERVISOR_mmu_update:
25768 +                       /* No-op on ia64.  */
25769 +                       break;
25770 +               case __HYPERVISOR_grant_table_op:
25771 +                       mce->args[1] = (unsigned long)xencommize_grant_table_op
25772 +                               (mce->args[0], (void *)mce->args[1],
25773 +                                mce->args[2]);
25774 +                       break;
25775 +               case __HYPERVISOR_memory_op:
25776 +               default:
25777 +                       printk("%s: unhandled multicall op entry op %lu\n",
25778 +                              __func__, mce->op);
25779 +                       return -ENOSYS;
25780 +               }
25781 +       }
25782 +
25783 +       return xencomm_arch_hypercall_multicall
25784 +               (xencomm_create_inline(call_list), nr_calls);
25785 +}
25786 +
25787 +int
25788 +xencomm_hypercall_callback_op(int cmd, void *arg)
25789 +{
25790 +       switch (cmd)
25791 +       {
25792 +       case CALLBACKOP_register:
25793 +       case CALLBACKOP_unregister:
25794 +               break;
25795 +       default:
25796 +               printk("%s: unknown callback op %d\n", __func__, cmd);
25797 +               return -ENOSYS;
25798 +       }
25799 +
25800 +       return xencomm_arch_hypercall_callback_op
25801 +               (cmd, xencomm_create_inline(arg));
25802 +}
25803 +
25804 +static void
25805 +xencommize_memory_reservation (xen_memory_reservation_t *mop)
25806 +{
25807 +       struct xencomm_handle *desc;
25808 +
25809 +       desc = xencomm_create_inline(xen_guest_handle(mop->extent_start));
25810 +       set_xen_guest_handle(mop->extent_start, (void *)desc);
25811 +}
25812 +
25813 +int
25814 +xencomm_hypercall_memory_op(unsigned int cmd, void *arg)
25815 +{
25816 +       XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2];
25817 +       xen_memory_reservation_t *xmr = NULL, *xme_in = NULL, *xme_out = NULL;
25818 +       int rc;
25819 +
25820 +       switch (cmd) {
25821 +       case XENMEM_increase_reservation:
25822 +       case XENMEM_decrease_reservation:
25823 +       case XENMEM_populate_physmap:
25824 +               xmr = (xen_memory_reservation_t *)arg;
25825 +               xen_guest_handle(extent_start_va[0]) =
25826 +                       xen_guest_handle(xmr->extent_start);
25827 +               xencommize_memory_reservation((xen_memory_reservation_t *)arg);
25828 +               break;
25829 +               
25830 +       case XENMEM_maximum_ram_page:
25831 +               break;
25832 +
25833 +       case XENMEM_exchange:
25834 +               xme_in  = &((xen_memory_exchange_t *)arg)->in;
25835 +               xme_out = &((xen_memory_exchange_t *)arg)->out;
25836 +               xen_guest_handle(extent_start_va[0]) =
25837 +                       xen_guest_handle(xme_in->extent_start);
25838 +               xen_guest_handle(extent_start_va[1]) =
25839 +                       xen_guest_handle(xme_out->extent_start);
25840 +               xencommize_memory_reservation
25841 +                       (&((xen_memory_exchange_t *)arg)->in);
25842 +               xencommize_memory_reservation
25843 +                       (&((xen_memory_exchange_t *)arg)->out);
25844 +               break;
25845 +
25846 +       default:
25847 +               printk("%s: unknown memory op %d\n", __func__, cmd);
25848 +               return -ENOSYS;
25849 +       }
25850 +
25851 +       rc =  xencomm_arch_hypercall_memory_op(cmd, xencomm_create_inline(arg));
25852 +
25853 +       switch (cmd) {
25854 +       case XENMEM_increase_reservation:
25855 +       case XENMEM_decrease_reservation:
25856 +       case XENMEM_populate_physmap:
25857 +               xen_guest_handle(xmr->extent_start) =
25858 +                       xen_guest_handle(extent_start_va[0]);
25859 +               break;
25860 +
25861 +       case XENMEM_exchange:
25862 +               xen_guest_handle(xme_in->extent_start) =
25863 +                       xen_guest_handle(extent_start_va[0]);
25864 +               xen_guest_handle(xme_out->extent_start) =
25865 +                       xen_guest_handle(extent_start_va[1]);
25866 +               break;
25867 +       }
25868 +
25869 +       return rc;
25870 +}
25871 +
25872 +unsigned long
25873 +xencomm_hypercall_hvm_op(int cmd, void *arg)
25874 +{
25875 +       switch (cmd) {
25876 +       case HVMOP_set_param:
25877 +       case HVMOP_get_param:
25878 +               break;
25879 +       default:
25880 +               printk("%s: unknown hvm op %d\n", __func__, cmd);
25881 +               return -ENOSYS;
25882 +       }
25883 +
25884 +       return xencomm_arch_hypercall_hvm_op(cmd, xencomm_create_inline(arg));
25885 +}
25886 +
25887 +int
25888 +xencomm_hypercall_suspend(unsigned long srec)
25889 +{
25890 +       struct sched_shutdown arg;
25891 +
25892 +       arg.reason = SHUTDOWN_suspend;
25893 +
25894 +       return xencomm_arch_hypercall_suspend(xencomm_create_inline(&arg));
25895 +}
25896 +
25897 +int
25898 +xencomm_hypercall_xenoprof_op(int op, void *arg)
25899 +{
25900 +       switch (op) {
25901 +       case XENOPROF_init:
25902 +       case XENOPROF_set_active:
25903 +       case XENOPROF_set_passive:
25904 +       case XENOPROF_counter:
25905 +       case XENOPROF_get_buffer:
25906 +               break;
25907 +
25908 +       case XENOPROF_reset_active_list:
25909 +       case XENOPROF_reset_passive_list:
25910 +       case XENOPROF_reserve_counters:
25911 +       case XENOPROF_setup_events:
25912 +       case XENOPROF_enable_virq:
25913 +       case XENOPROF_start:
25914 +       case XENOPROF_stop:
25915 +       case XENOPROF_disable_virq:
25916 +       case XENOPROF_release_counters:
25917 +       case XENOPROF_shutdown:
25918 +               return xencomm_arch_hypercall_xenoprof_op(op, arg);
25919 +               break;
25920 +
25921 +       default:
25922 +               printk("%s: op %d isn't supported\n", __func__, op);
25923 +               return -ENOSYS;
25924 +       }
25925 +       return xencomm_arch_hypercall_xenoprof_op(op,
25926 +                                                 xencomm_create_inline(arg));
25927 +}
25928 +
25929 +int
25930 +xencomm_hypercall_perfmon_op(unsigned long cmd, void* arg, unsigned long count)
25931 +{
25932 +       switch (cmd) {
25933 +       case PFM_GET_FEATURES:
25934 +       case PFM_CREATE_CONTEXT:
25935 +       case PFM_WRITE_PMCS:
25936 +       case PFM_WRITE_PMDS:
25937 +       case PFM_LOAD_CONTEXT:
25938 +               break;
25939 +
25940 +       case PFM_DESTROY_CONTEXT:
25941 +       case PFM_UNLOAD_CONTEXT:
25942 +       case PFM_START:
25943 +       case PFM_STOP:
25944 +               return xencomm_arch_hypercall_perfmon_op(cmd, arg, count);
25945 +
25946 +       default:
25947 +               printk("%s:%d cmd %ld isn't supported\n",
25948 +                      __func__,__LINE__, cmd);
25949 +               BUG();
25950 +       }
25951 +
25952 +       return xencomm_arch_hypercall_perfmon_op(cmd,
25953 +                                                xencomm_create_inline(arg),
25954 +                                                count);
25955 +}
25956 diff -ruNp linux-2.6.19/arch/ia64/xen/xcom_mini.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/xcom_mini.c
25957 --- linux-2.6.19/arch/ia64/xen/xcom_mini.c      1970-01-01 00:00:00.000000000 +0000
25958 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xcom_mini.c    2007-02-02 19:10:21.000000000 +0000
25959 @@ -0,0 +1,417 @@
25960 +/*
25961 + * This program is free software; you can redistribute it and/or modify
25962 + * it under the terms of the GNU General Public License as published by
25963 + * the Free Software Foundation; either version 2 of the License, or
25964 + * (at your option) any later version.
25965 + *
25966 + * This program is distributed in the hope that it will be useful,
25967 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
25968 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25969 + * GNU General Public License for more details.
25970 + *
25971 + * You should have received a copy of the GNU General Public License
25972 + * along with this program; if not, write to the Free Software
25973 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
25974 + *
25975 + *          Tristan Gingold <tristan.gingold@bull.net>
25976 + */
25977 +#include <linux/types.h>
25978 +#include <linux/errno.h>
25979 +#include <linux/kernel.h>
25980 +#include <linux/module.h>
25981 +#include <xen/interface/xen.h>
25982 +#include <xen/interface/dom0_ops.h>
25983 +#include <xen/interface/memory.h>
25984 +#include <xen/interface/xencomm.h>
25985 +#include <xen/interface/version.h>
25986 +#include <xen/interface/event_channel.h>
25987 +#include <xen/interface/physdev.h>
25988 +#include <xen/interface/grant_table.h>
25989 +#include <xen/interface/hvm/params.h>
25990 +#include <xen/interface/xenoprof.h>
25991 +#ifdef CONFIG_VMX_GUEST
25992 +#include <asm/hypervisor.h>
25993 +#else
25994 +#include <asm/hypercall.h>
25995 +#endif
25996 +#include <asm/xen/xencomm.h>
25997 +#include <asm/perfmon.h>
25998 +
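+/*
+ * The *_mini hypercall wrappers below mirror those in xcom_hcall.c, but
+ * build their xencomm descriptors in caller-provided on-stack
+ * struct xencomm_mini areas via xencomm_create_mini() instead of using
+ * inline descriptors.
+ */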
25999 +int
26000 +xencomm_mini_hypercall_event_channel_op(int cmd, void *op)
26001 +{
26002 +       struct xencomm_mini xc_area[2];
26003 +       int nbr_area = 2;
26004 +       struct xencomm_handle *desc;
26005 +       int rc;
26006 +
26007 +       rc = xencomm_create_mini(xc_area, &nbr_area,
26008 +                                op, sizeof(evtchn_op_t), &desc);
26009 +       if (rc)
26010 +               return rc;
26011 +
26012 +       return xencomm_arch_hypercall_event_channel_op(cmd, desc);
26013 +}
26014 +EXPORT_SYMBOL(xencomm_mini_hypercall_event_channel_op);
26015 +
26016 +static int
26017 +xencommize_mini_grant_table_op(struct xencomm_mini *xc_area, int *nbr_area,
26018 +                               unsigned int cmd, void *op, unsigned int count,
26019 +                               struct xencomm_handle **desc)
26020 +{
26021 +       struct xencomm_handle *desc1;
26022 +       unsigned int argsize;
26023 +       int rc;
26024 +
26025 +       switch (cmd) {
26026 +       case GNTTABOP_map_grant_ref:
26027 +               argsize = sizeof(struct gnttab_map_grant_ref);
26028 +               break;
26029 +       case GNTTABOP_unmap_grant_ref:
26030 +               argsize = sizeof(struct gnttab_unmap_grant_ref);
26031 +               break;
26032 +       case GNTTABOP_setup_table:
26033 +       {
26034 +               struct gnttab_setup_table *setup = op;
26035 +
26036 +               argsize = sizeof(*setup);
26037 +
26038 +               if (count != 1)
26039 +                       return -EINVAL;
26040 +               rc = xencomm_create_mini
26041 +                       (xc_area, nbr_area,
26042 +                        xen_guest_handle(setup->frame_list),
26043 +                        setup->nr_frames 
26044 +                        * sizeof(*xen_guest_handle(setup->frame_list)),
26045 +                        &desc1);
26046 +               if (rc)
26047 +                       return rc;
26048 +               set_xen_guest_handle(setup->frame_list, (void *)desc1);
26049 +               break;
26050 +       }
26051 +       case GNTTABOP_dump_table:
26052 +               argsize = sizeof(struct gnttab_dump_table);
26053 +               break;
26054 +       case GNTTABOP_transfer:
26055 +               argsize = sizeof(struct gnttab_transfer);
26056 +               break;
26057 +       case GNTTABOP_copy:
26058 +               argsize = sizeof(struct gnttab_copy);
26059 +               break;
26060 +       default:
26061 +               printk("%s: unknown mini grant table op %d\n", __func__, cmd);
26062 +               BUG();
26063 +       }
26064 +
26065 +       rc = xencomm_create_mini(xc_area, nbr_area, op, count * argsize, desc);
26066 +       if (rc)
26067 +               return rc;
26068 +
26069 +       return 0;
26070 +}
26071 +
26072 +int
26073 +xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
26074 +                                      unsigned int count)
26075 +{
26076 +       int rc;
26077 +       struct xencomm_handle *desc;
26078 +       int nbr_area = 2;
26079 +       struct xencomm_mini xc_area[2];
26080 +
26081 +       rc = xencommize_mini_grant_table_op(xc_area, &nbr_area,
26082 +                                           cmd, op, count, &desc);
26083 +       if (rc)
26084 +               return rc;
26085 +
26086 +       return xencomm_arch_hypercall_grant_table_op(cmd, desc, count);
26087 +}
26088 +EXPORT_SYMBOL(xencomm_mini_hypercall_grant_table_op);
26089 +
26090 +int
26091 +xencomm_mini_hypercall_multicall(void *call_list, int nr_calls)
26092 +{
26093 +       int i;
26094 +       multicall_entry_t *mce;
26095 +       int nbr_area = 2 + nr_calls * 3;
26096 +       struct xencomm_mini xc_area[nbr_area];
26097 +       struct xencomm_handle *desc;
26098 +       int rc;
26099 +
26100 +       for (i = 0; i < nr_calls; i++) {
26101 +               mce = (multicall_entry_t *)call_list + i;
26102 +
26103 +               switch (mce->op) {
26104 +               case __HYPERVISOR_update_va_mapping:
26105 +               case __HYPERVISOR_mmu_update:
26106 +                       /* No-op on ia64.  */
26107 +                       break;
26108 +               case __HYPERVISOR_grant_table_op:
26109 +                       rc = xencommize_mini_grant_table_op
26110 +                               (xc_area, &nbr_area,
26111 +                                mce->args[0], (void *)mce->args[1],
26112 +                                mce->args[2], &desc);
26113 +                       if (rc)
26114 +                               return rc;
26115 +                       mce->args[1] = (unsigned long)desc;
26116 +                       break;
26117 +               case __HYPERVISOR_memory_op:
26118 +               default:
26119 +                       printk("%s: unhandled multicall op entry op %lu\n",
26120 +                              __func__, mce->op);
26121 +                       return -ENOSYS;
26122 +               }
26123 +       }
26124 +
26125 +       rc = xencomm_create_mini(xc_area, &nbr_area, call_list,
26126 +                                nr_calls * sizeof(multicall_entry_t), &desc);
26127 +       if (rc)
26128 +               return rc;
26129 +
26130 +       return xencomm_arch_hypercall_multicall(desc, nr_calls);
26131 +}
26132 +EXPORT_SYMBOL(xencomm_mini_hypercall_multicall);
26133 +
26134 +static int
26135 +xencommize_mini_memory_reservation(struct xencomm_mini *area, int *nbr_area,
26136 +                                   xen_memory_reservation_t *mop)
26137 +{
26138 +       struct xencomm_handle *desc;
26139 +       int rc;
26140 +
26141 +       rc = xencomm_create_mini
26142 +               (area, nbr_area,
26143 +                xen_guest_handle(mop->extent_start),
26144 +                mop->nr_extents 
26145 +                * sizeof(*xen_guest_handle(mop->extent_start)),
26146 +                &desc);
26147 +       if (rc)
26148 +               return rc;
26149 +
26150 +       set_xen_guest_handle(mop->extent_start, (void *)desc);
26151 +
26152 +       return 0;
26153 +}
26154 +
26155 +int
26156 +xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg)
26157 +{
26158 +       int nbr_area = 4;
26159 +       struct xencomm_mini xc_area[4];
26160 +       struct xencomm_handle *desc;
26161 +       int rc;
26162 +       unsigned int argsize;
26163 +
26164 +       switch (cmd) {
26165 +       case XENMEM_increase_reservation:
26166 +       case XENMEM_decrease_reservation:
26167 +       case XENMEM_populate_physmap:
26168 +               argsize = sizeof(xen_memory_reservation_t);
26169 +               rc = xencommize_mini_memory_reservation
26170 +                       (xc_area, &nbr_area, (xen_memory_reservation_t *)arg);
26171 +               if (rc)
26172 +                       return rc;
26173 +               break;
26174 +               
26175 +       case XENMEM_maximum_ram_page:
26176 +               argsize = 0;
26177 +               break;
26178 +
26179 +       case XENMEM_exchange:
26180 +               argsize = sizeof(xen_memory_exchange_t);
26181 +               rc = xencommize_mini_memory_reservation
26182 +                       (xc_area, &nbr_area,
26183 +                        &((xen_memory_exchange_t *)arg)->in);
26184 +               if (rc)
26185 +                       return rc;
26186 +               rc = xencommize_mini_memory_reservation
26187 +                       (xc_area, &nbr_area,
26188 +                        &((xen_memory_exchange_t *)arg)->out);
26189 +               if (rc)
26190 +                       return rc;
26191 +               break;
26192 +
26193 +       case XENMEM_add_to_physmap:
26194 +               argsize = sizeof (xen_add_to_physmap_t);
26195 +               break;
26196 +
26197 +       default:
26198 +               printk("%s: unknown mini memory op %d\n", __func__, cmd);
26199 +               return -ENOSYS;
26200 +       }
26201 +
26202 +       rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26203 +       if (rc)
26204 +               return rc;
26205 +
26206 +       return xencomm_arch_hypercall_memory_op(cmd, desc);
26207 +}
26208 +EXPORT_SYMBOL(xencomm_mini_hypercall_memory_op);
26209 +
26210 +unsigned long
26211 +xencomm_mini_hypercall_hvm_op(int cmd, void *arg)
26212 +{
26213 +       struct xencomm_handle *desc;
26214 +       int nbr_area = 2;
26215 +       struct xencomm_mini xc_area[2];
26216 +       unsigned int argsize;
26217 +       int rc;
26218 +
26219 +       switch (cmd) {
26220 +       case HVMOP_get_param:
26221 +       case HVMOP_set_param:
26222 +               argsize = sizeof(xen_hvm_param_t);
26223 +               break;
26224 +       default:
26225 +               printk("%s: unknown HVMOP %d\n", __func__, cmd);
26226 +               return -EINVAL;
26227 +       }
26228 +
26229 +       rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26230 +       if (rc)
26231 +               return rc;
26232 +
26233 +       return xencomm_arch_hypercall_hvm_op(cmd, desc);
26234 +}
26235 +EXPORT_SYMBOL(xencomm_mini_hypercall_hvm_op);
26236 +
26237 +int
26238 +xencomm_mini_hypercall_xen_version(int cmd, void *arg)
26239 +{
26240 +       struct xencomm_handle *desc;
26241 +       int nbr_area = 2;
26242 +       struct xencomm_mini xc_area[2];
26243 +       unsigned int argsize;
26244 +       int rc;
26245 +
26246 +       switch (cmd) {
26247 +       case XENVER_version:
26248 +               /* do not actually pass an argument */
26249 +               return xencomm_arch_hypercall_xen_version(cmd, 0);
26250 +       case XENVER_extraversion:
26251 +               argsize = sizeof(xen_extraversion_t);
26252 +               break;
26253 +       case XENVER_compile_info:
26254 +               argsize = sizeof(xen_compile_info_t);
26255 +               break;
26256 +       case XENVER_capabilities:
26257 +               argsize = sizeof(xen_capabilities_info_t);
26258 +               break;
26259 +       case XENVER_changeset:
26260 +               argsize = sizeof(xen_changeset_info_t);
26261 +               break;
26262 +       case XENVER_platform_parameters:
26263 +               argsize = sizeof(xen_platform_parameters_t);
26264 +               break;
26265 +       case XENVER_pagesize:
26266 +               argsize = (arg == NULL) ? 0 : sizeof(void *);
26267 +               break;
26268 +       case XENVER_get_features:
26269 +               argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t);
26270 +               break;
26271 +
26272 +       default:
26273 +               printk("%s: unknown version op %d\n", __func__, cmd);
26274 +               return -ENOSYS;
26275 +       }
26276 +
26277 +       rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26278 +       if (rc)
26279 +               return rc;
26280 +
26281 +       return xencomm_arch_hypercall_xen_version(cmd, desc);
26282 +}
26283 +EXPORT_SYMBOL(xencomm_mini_hypercall_xen_version);
26284 +
26285 +int
26286 +xencomm_mini_hypercall_xenoprof_op(int op, void *arg)
26287 +{
26288 +       unsigned int argsize;
26289 +       struct xencomm_mini xc_area[2];
26290 +       int nbr_area = 2;
26291 +       struct xencomm_handle *desc;
26292 +       int rc;
26293 +
26294 +       switch (op) {
26295 +       case XENOPROF_init:
26296 +               argsize = sizeof(xenoprof_init_t);
26297 +               break;
26298 +       case XENOPROF_set_active:
26299 +               argsize = sizeof(domid_t);
26300 +               break;
26301 +       case XENOPROF_set_passive:
26302 +               argsize = sizeof(xenoprof_passive_t);
26303 +               break;
26304 +       case XENOPROF_counter:
26305 +               argsize = sizeof(xenoprof_counter_t);
26306 +               break;
26307 +       case XENOPROF_get_buffer:
26308 +               argsize = sizeof(xenoprof_get_buffer_t);
26309 +               break;
26310 +
26311 +       case XENOPROF_reset_active_list:
26312 +       case XENOPROF_reset_passive_list:
26313 +       case XENOPROF_reserve_counters:
26314 +       case XENOPROF_setup_events:
26315 +       case XENOPROF_enable_virq:
26316 +       case XENOPROF_start:
26317 +       case XENOPROF_stop:
26318 +       case XENOPROF_disable_virq:
26319 +       case XENOPROF_release_counters:
26320 +       case XENOPROF_shutdown:
26321 +               return xencomm_arch_hypercall_xenoprof_op(op, arg);
26322 +
26323 +       default:
26324 +               printk("%s: op %d isn't supported\n", __func__, op);
26325 +               return -ENOSYS;
26326 +       }
26327 +       rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26328 +       if (rc)
26329 +               return rc;
26330 +       return xencomm_arch_hypercall_xenoprof_op(op, desc);
26331 +}
26332 +EXPORT_SYMBOL_GPL(xencomm_mini_hypercall_xenoprof_op);
26333 +
26334 +int
26335 +xencomm_mini_hypercall_perfmon_op(unsigned long cmd, void* arg,
26336 +                                  unsigned long count)
26337 +{
26338 +       unsigned int argsize;
26339 +       struct xencomm_mini xc_area[2];
26340 +       int nbr_area = 2;
26341 +       struct xencomm_handle *desc;
26342 +       int rc;
26343 +
26344 +       switch (cmd) {
26345 +       case PFM_GET_FEATURES:
26346 +               argsize = sizeof(pfarg_features_t);
26347 +               break;
26348 +       case PFM_CREATE_CONTEXT:
26349 +               argsize = sizeof(pfarg_context_t);
26350 +               break;
26351 +       case PFM_LOAD_CONTEXT:
26352 +               argsize = sizeof(pfarg_load_t);
26353 +               break;
26354 +       case PFM_WRITE_PMCS:
26355 +       case PFM_WRITE_PMDS:
26356 +               argsize = sizeof(pfarg_reg_t) * count;
26357 +               break;
26358 +
26359 +       case PFM_DESTROY_CONTEXT:
26360 +       case PFM_UNLOAD_CONTEXT:
26361 +       case PFM_START:
26362 +       case PFM_STOP:
26363 +               return xencomm_arch_hypercall_perfmon_op(cmd, arg, count);
26364 +
26365 +       default:
26366 +               printk("%s:%d cmd %ld isn't supported\n",
26367 +                      __func__, __LINE__, cmd);
26368 +               BUG();
26369 +       }
26370 +
26371 +       rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26372 +       if (rc)
26373 +               return rc;
26374 +       return xencomm_arch_hypercall_perfmon_op(cmd, desc, count);
26375 +}
26376 +EXPORT_SYMBOL_GPL(xencomm_mini_hypercall_perfmon_op);
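
The xencomm_mini_hypercall_*() wrappers above all share one shape: pick the argument size from the command code, describe the caller's buffer with one or two stack-resident struct xencomm_mini areas via xencomm_create_mini(), and pass the resulting physical-address handle to the matching xencomm_arch_hypercall_*() stub. The sketch below only restates that shared pattern; it is not part of the patch, the FOO_* command values and xencomm_arch_hypercall_foo_op() are invented for illustration, and xencomm_create_mini() is the helper added later in this patch (arch/ia64/xen/xencomm.c).

/* Illustrative sketch only -- not applied by this patch. */
int
xencomm_mini_hypercall_foo_op(int cmd, void *arg)
{
	struct xencomm_mini xc_area[2];	/* descriptors live on the caller's stack */
	int nbr_area = 2;		/* number of still-unused areas */
	struct xencomm_handle *desc;
	unsigned int argsize;
	int rc;

	switch (cmd) {
	case 0:	/* hypothetical FOO_get */
	case 1:	/* hypothetical FOO_set */
		argsize = sizeof(long);	/* argument size is chosen per command */
		break;
	default:
		return -EINVAL;
	}

	/* Describe 'arg' by physical address so Xen can read it without
	 * knowing anything about the guest kernel's virtual mappings. */
	rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
	if (rc)
		return rc;

	/* 'desc' is a physical-address handle, handed to the raw stub. */
	return xencomm_arch_hypercall_foo_op(cmd, desc);
}
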
26377 diff -ruNp linux-2.6.19/arch/ia64/xen/xcom_privcmd.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/xcom_privcmd.c
26378 --- linux-2.6.19/arch/ia64/xen/xcom_privcmd.c   1970-01-01 00:00:00.000000000 +0000
26379 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xcom_privcmd.c 2007-02-02 19:10:21.000000000 +0000
26380 @@ -0,0 +1,663 @@
26381 +/*
26382 + * This program is free software; you can redistribute it and/or modify
26383 + * it under the terms of the GNU General Public License as published by
26384 + * the Free Software Foundation; either version 2 of the License, or
26385 + * (at your option) any later version.
26386 + *
26387 + * This program is distributed in the hope that it will be useful,
26388 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
26389 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26390 + * GNU General Public License for more details.
26391 + *
26392 + * You should have received a copy of the GNU General Public License
26393 + * along with this program; if not, write to the Free Software
26394 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
26395 + *
26396 + * Authors: Hollis Blanchard <hollisb@us.ibm.com>
26397 + *          Tristan Gingold <tristan.gingold@bull.net>
26398 + */
26399 +#include <linux/types.h>
26400 +#include <linux/errno.h>
26401 +#include <linux/kernel.h>
26402 +#include <linux/gfp.h>
26403 +#include <linux/module.h>
26404 +#include <xen/interface/xen.h>
26405 +#include <xen/interface/dom0_ops.h>
26406 +#define __XEN__
26407 +#include <xen/interface/domctl.h>
26408 +#include <xen/interface/sysctl.h>
26409 +#include <xen/interface/memory.h>
26410 +#include <xen/interface/version.h>
26411 +#include <xen/interface/event_channel.h>
26412 +#include <xen/interface/acm_ops.h>
26413 +#include <xen/interface/hvm/params.h>
26414 +#include <xen/public/privcmd.h>
26415 +#include <asm/hypercall.h>
26416 +#include <asm/page.h>
26417 +#include <asm/uaccess.h>
26418 +#include <asm/xen/xencomm.h>
26419 +
26420 +#define ROUND_DIV(v,s) (((v) + (s) - 1) / (s))
26421 +
26422 +static int
26423 +xencomm_privcmd_dom0_op(privcmd_hypercall_t *hypercall)
26424 +{
26425 +       dom0_op_t kern_op;
26426 +       dom0_op_t __user *user_op = (dom0_op_t __user *)hypercall->arg[0];
26427 +       struct xencomm_handle *op_desc;
26428 +       struct xencomm_handle *desc = NULL;
26429 +       int ret = 0;
26430 +
26431 +       if (copy_from_user(&kern_op, user_op, sizeof(dom0_op_t)))
26432 +               return -EFAULT;
26433 +
26434 +       if (kern_op.interface_version != DOM0_INTERFACE_VERSION)
26435 +               return -EACCES;
26436 +
26437 +       op_desc = xencomm_create_inline(&kern_op);
26438 +
26439 +       switch (kern_op.cmd) {
26440 +       default:
26441 +               printk("%s: unknown dom0 cmd %d\n", __func__, kern_op.cmd);
26442 +               return -ENOSYS;
26443 +       }
26444 +
26445 +       if (ret) {
26446 +               /* error mapping the nested pointer */
26447 +               return ret;
26448 +       }
26449 +
26450 +       ret = xencomm_arch_hypercall_dom0_op(op_desc);
26451 +
26452 +       /* FIXME: should we restore the handle?  */
26453 +       if (copy_to_user(user_op, &kern_op, sizeof(dom0_op_t)))
26454 +               ret = -EFAULT;
26455 +
26456 +       if (desc)
26457 +               xencomm_free(desc);
26458 +       return ret;
26459 +}
26460 +
26461 +/*
26462 + * Temporarily disable the NUMA PHYSINFO code until the rest of the
26463 + * changes are upstream.
26464 + */
26465 +#undef IA64_NUMA_PHYSINFO
26466 +
26467 +static int
26468 +xencomm_privcmd_sysctl(privcmd_hypercall_t *hypercall)
26469 +{
26470 +       xen_sysctl_t kern_op;
26471 +       xen_sysctl_t __user *user_op;
26472 +       struct xencomm_handle *op_desc;
26473 +       struct xencomm_handle *desc = NULL;
26474 +       struct xencomm_handle *desc1 = NULL;
26475 +       int ret = 0;
26476 +
26477 +       user_op = (xen_sysctl_t __user *)hypercall->arg[0];
26478 +
26479 +       if (copy_from_user(&kern_op, user_op, sizeof(xen_sysctl_t)))
26480 +               return -EFAULT;
26481 +
26482 +       if (kern_op.interface_version != XEN_SYSCTL_INTERFACE_VERSION)
26483 +               return -EACCES;
26484 +
26485 +       op_desc = xencomm_create_inline(&kern_op);
26486 +
26487 +       switch (kern_op.cmd) {
26488 +       case XEN_SYSCTL_readconsole:
26489 +               ret = xencomm_create(
26490 +                       xen_guest_handle(kern_op.u.readconsole.buffer),
26491 +                       kern_op.u.readconsole.count,
26492 +                       &desc, GFP_KERNEL);
26493 +               set_xen_guest_handle(kern_op.u.readconsole.buffer,
26494 +                                    (void *)desc);
26495 +               break;
26496 +       case XEN_SYSCTL_tbuf_op:
26497 +#ifndef IA64_NUMA_PHYSINFO
26498 +       case XEN_SYSCTL_physinfo:
26499 +#endif
26500 +       case XEN_SYSCTL_sched_id:
26501 +               break;
26502 +       case XEN_SYSCTL_perfc_op:
26503 +       {
26504 +               struct xencomm_handle *tmp_desc;
26505 +               xen_sysctl_t tmp_op = {
26506 +                       .cmd = XEN_SYSCTL_perfc_op,
26507 +                       .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
26508 +                       .u.perfc_op = {
26509 +                               .cmd = XEN_SYSCTL_PERFCOP_query,
26510 +                               // .desc.p = NULL,
26511 +                               // .val.p = NULL,
26512 +                       },
26513 +               };
26514 +
26515 +               if (xen_guest_handle(kern_op.u.perfc_op.desc) == NULL) {
26516 +                       if (xen_guest_handle(kern_op.u.perfc_op.val) != NULL)
26517 +                               return -EINVAL;
26518 +                       break;
26519 +               }
26520 +
26521 +               /* query the buffer size for xencomm */
26522 +               tmp_desc = xencomm_create_inline(&tmp_op);
26523 +               ret = xencomm_arch_hypercall_sysctl(tmp_desc);
26524 +               if (ret)
26525 +                       return ret;
26526 +
26527 +               ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.desc),
26528 +                                    tmp_op.u.perfc_op.nr_counters *
26529 +                                    sizeof(xen_sysctl_perfc_desc_t),
26530 +                                    &desc, GFP_KERNEL);
26531 +               if (ret)
26532 +                       return ret;
26533 +
26534 +               set_xen_guest_handle(kern_op.u.perfc_op.desc, (void *)desc);
26535 +
26536 +               ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.val),
26537 +                                    tmp_op.u.perfc_op.nr_vals *
26538 +                                    sizeof(xen_sysctl_perfc_val_t),
26539 +                                    &desc1, GFP_KERNEL);
26540 +               if (ret)
26541 +                       xencomm_free(desc);
26542 +
26543 +               set_xen_guest_handle(kern_op.u.perfc_op.val, (void *)desc1);
26544 +               break;
26545 +       }
26546 +       case XEN_SYSCTL_getdomaininfolist:
26547 +               ret = xencomm_create(
26548 +                       xen_guest_handle(kern_op.u.getdomaininfolist.buffer),
26549 +                       kern_op.u.getdomaininfolist.max_domains *
26550 +                       sizeof(xen_domctl_getdomaininfo_t),
26551 +                       &desc, GFP_KERNEL);
26552 +               set_xen_guest_handle(kern_op.u.getdomaininfolist.buffer,
26553 +                                    (void *)desc);
26554 +               break;
26555 +#ifdef IA64_NUMA_PHYSINFO
26556 +       case XEN_SYSCTL_physinfo:
26557 +               ret = xencomm_create(
26558 +                       xen_guest_handle(kern_op.u.physinfo.memory_chunks),
26559 +                       PUBLIC_MAXCHUNKS * sizeof(node_data_t),
26560 +                       &desc, GFP_KERNEL);
26561 +               if (ret)
26562 +                       return ret;
26563 +               set_xen_guest_handle(kern_op.u.physinfo.memory_chunks,
26564 +                                    (void *)desc);
26565 +
26566 +               ret = xencomm_create(
26567 +                       xen_guest_handle(kern_op.u.physinfo.cpu_to_node),
26568 +                       PUBLIC_MAX_NUMNODES * sizeof(u64),
26569 +                       &desc1, GFP_KERNEL);
26570 +               if (ret)
26571 +                       xencomm_free(desc);
26572 +               set_xen_guest_handle(kern_op.u.physinfo.cpu_to_node,
26573 +                                    (void *)desc1);
26574 +               break;
26575 +#endif
26576 +       default:
26577 +               printk("%s: unknown sysctl cmd %d\n", __func__, kern_op.cmd);
26578 +               return -ENOSYS;
26579 +       }
26580 +
26581 +       if (ret) {
26582 +               /* error mapping the nested pointer */
26583 +               return ret;
26584 +       }
26585 +
26586 +       ret = xencomm_arch_hypercall_sysctl(op_desc);
26587 +
26588 +       /* FIXME: should we restore the handles?  */
26589 +       if (copy_to_user(user_op, &kern_op, sizeof(xen_sysctl_t)))
26590 +               ret = -EFAULT;
26591 +
26592 +       if (desc)
26593 +               xencomm_free(desc);
26594 +       if (desc1)
26595 +               xencomm_free(desc1);
26596 +       return ret;
26597 +}
26598 +
26599 +static int
26600 +xencomm_privcmd_domctl(privcmd_hypercall_t *hypercall)
26601 +{
26602 +       xen_domctl_t kern_op;
26603 +       xen_domctl_t __user *user_op;
26604 +       struct xencomm_handle *op_desc;
26605 +       struct xencomm_handle *desc = NULL;
26606 +       int ret = 0;
26607 +
26608 +       user_op = (xen_domctl_t __user *)hypercall->arg[0];
26609 +
26610 +       if (copy_from_user(&kern_op, user_op, sizeof(xen_domctl_t)))
26611 +               return -EFAULT;
26612 +
26613 +       if (kern_op.interface_version != XEN_DOMCTL_INTERFACE_VERSION)
26614 +               return -EACCES;
26615 +
26616 +       op_desc = xencomm_create_inline(&kern_op);
26617 +
26618 +       switch (kern_op.cmd) {
26619 +       case XEN_DOMCTL_createdomain:
26620 +       case XEN_DOMCTL_destroydomain:
26621 +       case XEN_DOMCTL_pausedomain:
26622 +       case XEN_DOMCTL_unpausedomain:
26623 +       case XEN_DOMCTL_getdomaininfo:
26624 +               break;
26625 +       case XEN_DOMCTL_getmemlist:
26626 +       {
26627 +               unsigned long nr_pages = kern_op.u.getmemlist.max_pfns;
26628 +
26629 +               ret = xencomm_create(
26630 +                       xen_guest_handle(kern_op.u.getmemlist.buffer),
26631 +                       nr_pages * sizeof(unsigned long),
26632 +                       &desc, GFP_KERNEL);
26633 +               set_xen_guest_handle(kern_op.u.getmemlist.buffer,
26634 +                                    (void *)desc);
26635 +               break;
26636 +       }
26637 +       case XEN_DOMCTL_getpageframeinfo:
26638 +               break;
26639 +       case XEN_DOMCTL_getpageframeinfo2:
26640 +               ret = xencomm_create(
26641 +                       xen_guest_handle(kern_op.u.getpageframeinfo2.array),
26642 +                       kern_op.u.getpageframeinfo2.num,
26643 +                       &desc, GFP_KERNEL);
26644 +               set_xen_guest_handle(kern_op.u.getpageframeinfo2.array,
26645 +                                    (void *)desc);
26646 +               break;
26647 +       case XEN_DOMCTL_shadow_op:
26648 +               ret = xencomm_create(
26649 +                       xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap),
26650 +                       ROUND_DIV(kern_op.u.shadow_op.pages, 8),
26651 +                       &desc, GFP_KERNEL);
26652 +               set_xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap,
26653 +                                    (void *)desc);
26654 +               break;
26655 +       case XEN_DOMCTL_max_mem:
26656 +               break;
26657 +       case XEN_DOMCTL_setvcpucontext:
26658 +       case XEN_DOMCTL_getvcpucontext:
26659 +               ret = xencomm_create(
26660 +                       xen_guest_handle(kern_op.u.vcpucontext.ctxt),
26661 +                       sizeof(vcpu_guest_context_t),
26662 +                       &desc, GFP_KERNEL);
26663 +               set_xen_guest_handle(kern_op.u.vcpucontext.ctxt, (void *)desc);
26664 +               break;
26665 +       case XEN_DOMCTL_getvcpuinfo:
26666 +               break;
26667 +       case XEN_DOMCTL_setvcpuaffinity:
26668 +       case XEN_DOMCTL_getvcpuaffinity:
26669 +               ret = xencomm_create(
26670 +                       xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap),
26671 +                       ROUND_DIV(kern_op.u.vcpuaffinity.cpumap.nr_cpus, 8),
26672 +                       &desc, GFP_KERNEL);
26673 +               set_xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap,
26674 +                                    (void *)desc);
26675 +               break;
26676 +       case XEN_DOMCTL_max_vcpus:
26677 +       case XEN_DOMCTL_scheduler_op:
26678 +       case XEN_DOMCTL_setdomainhandle:
26679 +       case XEN_DOMCTL_setdebugging:
26680 +       case XEN_DOMCTL_irq_permission:
26681 +       case XEN_DOMCTL_iomem_permission:
26682 +       case XEN_DOMCTL_ioport_permission:
26683 +       case XEN_DOMCTL_hypercall_init:
26684 +       case XEN_DOMCTL_arch_setup:
26685 +       case XEN_DOMCTL_settimeoffset:
26686 +               break;
26687 +       default:
26688 +               printk("%s: unknown domctl cmd %d\n", __func__, kern_op.cmd);
26689 +               return -ENOSYS;
26690 +       }
26691 +
26692 +       if (ret) {
26693 +               /* error mapping the nested pointer */
26694 +               return ret;
26695 +       }
26696 +
26697 +       ret = xencomm_arch_hypercall_domctl (op_desc);
26698 +
26699 +       /* FIXME: should we restore the handle?  */
26700 +       if (copy_to_user(user_op, &kern_op, sizeof(xen_domctl_t)))
26701 +               ret = -EFAULT;
26702 +
26703 +       if (desc)
26704 +               xencomm_free(desc);
26705 +       return ret;
26706 +}
26707 +
26708 +static int
26709 +xencomm_privcmd_acm_op(privcmd_hypercall_t *hypercall)
26710 +{
26711 +       int cmd = hypercall->arg[0];
26712 +       void __user *arg = (void __user *)hypercall->arg[1];
26713 +       struct xencomm_handle *op_desc;
26714 +       struct xencomm_handle *desc = NULL;
26715 +       int ret;
26716 +
26717 +       switch (cmd) {
26718 +       case ACMOP_getssid:
26719 +       {
26720 +               struct acm_getssid kern_arg;
26721 +
26722 +               if (copy_from_user(&kern_arg, arg, sizeof (kern_arg)))
26723 +                       return -EFAULT;
26724 +
26725 +               op_desc = xencomm_create_inline(&kern_arg);
26726 +
26727 +               ret = xencomm_create(xen_guest_handle(kern_arg.ssidbuf),
26728 +                                    kern_arg.ssidbuf_size, &desc, GFP_KERNEL);
26729 +               if (ret)
26730 +                       return ret;
26731 +
26732 +               set_xen_guest_handle(kern_arg.ssidbuf, (void *)desc);
26733 +
26734 +               ret = xencomm_arch_hypercall_acm_op(cmd, op_desc);
26735 +
26736 +               xencomm_free(desc);
26737 +
26738 +               if (copy_to_user(arg, &kern_arg, sizeof (kern_arg)))
26739 +                       return -EFAULT;
26740 +
26741 +               return ret;
26742 +       }
26743 +       default:
26744 +               printk("%s: unknown acm_op cmd %d\n", __func__, cmd);
26745 +               return -ENOSYS;
26746 +       }
26747 +
26748 +       return ret;
26749 +}
26750 +
26751 +static int
26752 +xencomm_privcmd_memory_op(privcmd_hypercall_t *hypercall)
26753 +{
26754 +       const unsigned long cmd = hypercall->arg[0];
26755 +       int ret = 0;
26756 +
26757 +       switch (cmd) {
26758 +       case XENMEM_increase_reservation:
26759 +       case XENMEM_decrease_reservation:
26760 +       case XENMEM_populate_physmap:
26761 +       {
26762 +               xen_memory_reservation_t kern_op;
26763 +               xen_memory_reservation_t __user *user_op;
26764 +               struct xencomm_handle *desc = NULL;
26765 +               struct xencomm_handle *desc_op;
26766 +
26767 +               user_op = (xen_memory_reservation_t __user *)hypercall->arg[1];
26768 +               if (copy_from_user(&kern_op, user_op,
26769 +                                  sizeof(xen_memory_reservation_t)))
26770 +                       return -EFAULT;
26771 +               desc_op = xencomm_create_inline(&kern_op);
26772 +
26773 +               if (xen_guest_handle(kern_op.extent_start)) {
26774 +                       void * addr;
26775 +
26776 +                       addr = xen_guest_handle(kern_op.extent_start);
26777 +                       ret = xencomm_create
26778 +                               (addr,
26779 +                                kern_op.nr_extents *
26780 +                                sizeof(*xen_guest_handle
26781 +                                       (kern_op.extent_start)),
26782 +                                &desc, GFP_KERNEL);
26783 +                       if (ret)
26784 +                               return ret;
26785 +                       set_xen_guest_handle(kern_op.extent_start,
26786 +                                            (void *)desc);
26787 +               }
26788 +
26789 +               ret = xencomm_arch_hypercall_memory_op(cmd, desc_op);
26790 +
26791 +               if (desc)
26792 +                       xencomm_free(desc);
26793 +
26794 +               if (ret != 0)
26795 +                       return ret;
26796 +
26797 +               if (copy_to_user(user_op, &kern_op,
26798 +                                sizeof(xen_memory_reservation_t)))
26799 +                       return -EFAULT;
26800 +
26801 +               return ret;
26802 +       }
26803 +       case XENMEM_translate_gpfn_list:
26804 +       {
26805 +               xen_translate_gpfn_list_t kern_op;
26806 +               xen_translate_gpfn_list_t __user *user_op;
26807 +               struct xencomm_handle *desc_gpfn = NULL;
26808 +               struct xencomm_handle *desc_mfn = NULL;
26809 +               struct xencomm_handle *desc_op;
26810 +               void *addr;
26811 +
26812 +               user_op = (xen_translate_gpfn_list_t __user *)
26813 +                       hypercall->arg[1];
26814 +               if (copy_from_user(&kern_op, user_op,
26815 +                                  sizeof(xen_translate_gpfn_list_t)))
26816 +                       return -EFAULT;
26817 +               desc_op = xencomm_create_inline(&kern_op);
26818 +
26819 +               if (kern_op.nr_gpfns) {
26820 +                       /* gpfn_list.  */
26821 +                       addr = xen_guest_handle(kern_op.gpfn_list);
26822 +
26823 +                       ret = xencomm_create(addr, kern_op.nr_gpfns *
26824 +                                            sizeof(*xen_guest_handle
26825 +                                                   (kern_op.gpfn_list)),
26826 +                                            &desc_gpfn, GFP_KERNEL);
26827 +                       if (ret)
26828 +                               return ret;
26829 +                       set_xen_guest_handle(kern_op.gpfn_list,
26830 +                                            (void *)desc_gpfn);
26831 +
26832 +                       /* mfn_list.  */
26833 +                       addr = xen_guest_handle(kern_op.mfn_list);
26834 +
26835 +                       ret = xencomm_create(addr, kern_op.nr_gpfns *
26836 +                                            sizeof(*xen_guest_handle
26837 +                                                   (kern_op.mfn_list)),
26838 +                                            &desc_mfn, GFP_KERNEL);
26839 +                       if (ret)
26840 +                               return ret;
26841 +                       set_xen_guest_handle(kern_op.mfn_list,
26842 +                                            (void *)desc_mfn);
26843 +               }
26844 +
26845 +               ret = xencomm_arch_hypercall_memory_op(cmd, desc_op);
26846 +
26847 +               if (desc_gpfn)
26848 +                       xencomm_free(desc_gpfn);
26849 +
26850 +               if (desc_mfn)
26851 +                       xencomm_free(desc_mfn);
26852 +
26853 +               if (ret != 0)
26854 +                       return ret;
26855 +
26856 +               return ret;
26857 +       }
26858 +       default:
26859 +               printk("%s: unknown memory op %lu\n", __func__, cmd);
26860 +               ret = -ENOSYS;
26861 +       }
26862 +       return ret;
26863 +}
26864 +
26865 +static int
26866 +xencomm_privcmd_xen_version(privcmd_hypercall_t *hypercall)
26867 +{
26868 +       int cmd = hypercall->arg[0];
26869 +       void __user *arg = (void __user *)hypercall->arg[1];
26870 +       struct xencomm_handle *desc;
26871 +       size_t argsize;
26872 +       int rc;
26873 +
26874 +       switch (cmd) {
26875 +       case XENVER_version:
26876 +               /* do not actually pass an argument */
26877 +               return xencomm_arch_hypercall_xen_version(cmd, 0);
26878 +       case XENVER_extraversion:
26879 +               argsize = sizeof(xen_extraversion_t);
26880 +               break;
26881 +       case XENVER_compile_info:
26882 +               argsize = sizeof(xen_compile_info_t);
26883 +               break;
26884 +       case XENVER_capabilities:
26885 +               argsize = sizeof(xen_capabilities_info_t);
26886 +               break;
26887 +       case XENVER_changeset:
26888 +               argsize = sizeof(xen_changeset_info_t);
26889 +               break;
26890 +       case XENVER_platform_parameters:
26891 +               argsize = sizeof(xen_platform_parameters_t);
26892 +               break;
26893 +       case XENVER_pagesize:
26894 +               argsize = (arg == NULL) ? 0 : sizeof(void *);
26895 +               break;
26896 +       case XENVER_get_features:
26897 +               argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t);
26898 +               break;
26899 +
26900 +       default:
26901 +               printk("%s: unknown version op %d\n", __func__, cmd);
26902 +               return -ENOSYS;
26903 +       }
26904 +
26905 +       rc = xencomm_create(arg, argsize, &desc, GFP_KERNEL);
26906 +       if (rc)
26907 +               return rc;
26908 +
26909 +       rc = xencomm_arch_hypercall_xen_version(cmd, desc);
26910 +
26911 +       xencomm_free(desc);
26912 +
26913 +       return rc;
26914 +}
26915 +
26916 +static int
26917 +xencomm_privcmd_event_channel_op(privcmd_hypercall_t *hypercall)
26918 +{
26919 +       int cmd = hypercall->arg[0];
26920 +       struct xencomm_handle *desc;
26921 +       unsigned int argsize;
26922 +       int ret;
26923 +
26924 +       switch (cmd) {
26925 +       case EVTCHNOP_alloc_unbound:
26926 +               argsize = sizeof(evtchn_alloc_unbound_t);
26927 +               break;
26928 +
26929 +       case EVTCHNOP_status:
26930 +               argsize = sizeof(evtchn_status_t);
26931 +               break;
26932 +
26933 +       default:
26934 +               printk("%s: unknown EVTCHNOP %d\n", __func__, cmd);
26935 +               return -EINVAL;
26936 +       }
26937 +
26938 +       ret = xencomm_create((void *)hypercall->arg[1], argsize,
26939 +                            &desc, GFP_KERNEL);
26940 +       if (ret)
26941 +               return ret;
26942 +
26943 +       ret = xencomm_arch_hypercall_event_channel_op(cmd, desc);
26944 +
26945 +       xencomm_free(desc);
26946 +       return ret;
26947 +}
26948 +
26949 +static int
26950 +xencomm_privcmd_hvm_op(privcmd_hypercall_t *hypercall)
26951 +{
26952 +       int cmd = hypercall->arg[0];
26953 +       struct xencomm_handle *desc;
26954 +       unsigned int argsize;
26955 +       int ret;
26956 +
26957 +       switch (cmd) {
26958 +       case HVMOP_get_param:
26959 +       case HVMOP_set_param:
26960 +               argsize = sizeof(xen_hvm_param_t);
26961 +               break;
26962 +       case HVMOP_set_pci_intx_level:
26963 +               argsize = sizeof(xen_hvm_set_pci_intx_level_t);
26964 +               break;
26965 +       case HVMOP_set_isa_irq_level:
26966 +               argsize = sizeof(xen_hvm_set_isa_irq_level_t);
26967 +               break;
26968 +       case HVMOP_set_pci_link_route:
26969 +               argsize = sizeof(xen_hvm_set_pci_link_route_t);
26970 +               break;
26971 +
26972 +       default:
26973 +               printk("%s: unknown HVMOP %d\n", __func__, cmd);
26974 +               return -EINVAL;
26975 +       }
26976 +
26977 +       ret = xencomm_create((void *)hypercall->arg[1], argsize,
26978 +                            &desc, GFP_KERNEL);
26979 +       if (ret)
26980 +               return ret;
26981 +
26982 +       ret = xencomm_arch_hypercall_hvm_op(cmd, desc);
26983 +
26984 +       xencomm_free(desc);
26985 +       return ret;
26986 +}
26987 +
26988 +static int
26989 +xencomm_privcmd_sched_op(privcmd_hypercall_t *hypercall)
26990 +{
26991 +       int cmd = hypercall->arg[0];
26992 +       struct xencomm_handle *desc;
26993 +       unsigned int argsize;
26994 +       int ret;
26995 +
26996 +       switch (cmd) {
26997 +       case SCHEDOP_remote_shutdown:
26998 +               argsize = sizeof(sched_remote_shutdown_t);
26999 +               break;
27000 +       default:
27001 +               printk("%s: unknown SCHEDOP %d\n", __func__, cmd);
27002 +               return -EINVAL;
27003 +       }
27004 +
27005 +       ret = xencomm_create((void *)hypercall->arg[1], argsize,
27006 +                            &desc, GFP_KERNEL);
27007 +       if (ret)
27008 +               return ret;
27009 +
27010 +       ret = xencomm_arch_hypercall_sched_op(cmd, desc);
27011 +
27012 +       xencomm_free(desc);
27013 +       return ret;
27014 +}
27015 +
27016 +int
27017 +privcmd_hypercall(privcmd_hypercall_t *hypercall)
27018 +{
27019 +       switch (hypercall->op) {
27020 +       case __HYPERVISOR_dom0_op:
27021 +               return xencomm_privcmd_dom0_op(hypercall);
27022 +       case __HYPERVISOR_domctl:
27023 +               return xencomm_privcmd_domctl(hypercall);
27024 +       case __HYPERVISOR_sysctl:
27025 +               return xencomm_privcmd_sysctl(hypercall);
27026 +       case __HYPERVISOR_acm_op:
27027 +               return xencomm_privcmd_acm_op(hypercall);
27028 +       case __HYPERVISOR_xen_version:
27029 +               return xencomm_privcmd_xen_version(hypercall);
27030 +       case __HYPERVISOR_memory_op:
27031 +               return xencomm_privcmd_memory_op(hypercall);
27032 +       case __HYPERVISOR_event_channel_op:
27033 +               return xencomm_privcmd_event_channel_op(hypercall);
27034 +       case __HYPERVISOR_hvm_op:
27035 +               return xencomm_privcmd_hvm_op(hypercall);
27036 +       case __HYPERVISOR_sched_op:
27037 +               return xencomm_privcmd_sched_op(hypercall);
27038 +       default:
27039 +               printk("%s: unknown hcall (%ld)\n", __func__, hypercall->op);
27040 +               return -ENOSYS;
27041 +       }
27042 +}
27043 +
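
Every xencomm_privcmd_*() wrapper in the file above follows the same five steps: copy the ioctl argument from user space into a kernel copy, describe that copy with xencomm_create_inline(), wrap each nested guest-handle buffer with xencomm_create() and re-point the handle at the returned descriptor, issue the arch hypercall, then copy the possibly-updated structure back and free the nested descriptors. A condensed sketch of that flow follows; struct demo_op and demo_arch_hypercall() are hypothetical stand-ins for the real dom0_op/sysctl/domctl unions and their stubs, while the xencomm_* helpers are the ones this patch defines.

/* Illustrative sketch only -- not applied by this patch. */
struct demo_op {
	void *buffer;			/* nested pointer Xen must follow; points
					 * at user memory in practice */
	unsigned long count;		/* size of that buffer in bytes */
};

static int
xencomm_privcmd_demo_op(privcmd_hypercall_t *hypercall)
{
	struct demo_op kern_op;
	struct demo_op __user *user_op = (struct demo_op __user *)hypercall->arg[0];
	struct xencomm_handle *op_desc;
	struct xencomm_handle *desc = NULL;
	int ret;

	/* 1. Pull the top-level argument into kernel memory. */
	if (copy_from_user(&kern_op, user_op, sizeof(kern_op)))
		return -EFAULT;

	/* 2. Describe the kernel copy itself (small and contiguous). */
	op_desc = xencomm_create_inline(&kern_op);

	/* 3. Describe the nested buffer and swap the pointer for the
	 *    descriptor handle, so Xen follows physical addresses. */
	ret = xencomm_create(kern_op.buffer, kern_op.count, &desc, GFP_KERNEL);
	if (ret)
		return ret;
	kern_op.buffer = (void *)desc;

	/* 4. Hand the top-level descriptor to the hypervisor. */
	ret = demo_arch_hypercall(op_desc);

	/* 5. Propagate output fields back to user space, then clean up. */
	if (copy_to_user(user_op, &kern_op, sizeof(kern_op)))
		ret = -EFAULT;
	xencomm_free(desc);
	return ret;
}
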
27044 diff -ruNp linux-2.6.19/arch/ia64/xen/xencomm.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/xencomm.c
27045 --- linux-2.6.19/arch/ia64/xen/xencomm.c        1970-01-01 00:00:00.000000000 +0000
27046 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xencomm.c      2007-02-02 19:10:21.000000000 +0000
27047 @@ -0,0 +1,263 @@
27048 +/*
27049 + * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation
27050 + *
27051 + * This program is free software; you can redistribute it and/or modify
27052 + * it under the terms of the GNU General Public License as published by
27053 + * the Free Software Foundation; either version 2 of the License, or
27054 + * (at your option) any later version.
27055 + * 
27056 + * This program is distributed in the hope that it will be useful,
27057 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
27058 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27059 + * GNU General Public License for more details.
27060 + * 
27061 + * You should have received a copy of the GNU General Public License
27062 + * along with this program; if not, write to the Free Software
27063 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
27064 + */
27065 +
27066 +#include <linux/gfp.h>
27067 +#include <linux/mm.h>
27068 +#include <xen/interface/xen.h>
27069 +#include <asm/page.h>
27070 +
27071 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
27072 +#include <xen/platform-compat.h>
27073 +#endif
27074 +
27075 +#include <asm/xen/xencomm.h>
27076 +
27077 +static int xencomm_debug = 0;
27078 +
27079 +static unsigned long kernel_start_pa;
27080 +
27081 +void
27082 +xencomm_init (void)
27083 +{
27084 +       kernel_start_pa = KERNEL_START - ia64_tpa(KERNEL_START);
27085 +}
27086 +
27087 +/* Translate virtual address to physical address.  */
27088 +unsigned long
27089 +xencomm_vaddr_to_paddr(unsigned long vaddr)
27090 +{
27091 +#ifndef CONFIG_VMX_GUEST
27092 +       struct page *page;
27093 +       struct vm_area_struct *vma;
27094 +#endif
27095 +
27096 +       if (vaddr == 0)
27097 +               return 0;
27098 +
27099 +#ifdef __ia64__
27100 +       if (REGION_NUMBER(vaddr) == 5) {
27101 +               pgd_t *pgd;
27102 +               pud_t *pud;
27103 +               pmd_t *pmd;
27104 +               pte_t *ptep;
27105 +
27106 +               /* On ia64, TASK_SIZE refers to current.  It is not initialized
27107 +                  during boot.
27108 +                  Furthermore the kernel is relocatable and __pa() doesn't
27109 +                  work on such addresses.  */
27110 +               if (vaddr >= KERNEL_START
27111 +                   && vaddr < (KERNEL_START + KERNEL_TR_PAGE_SIZE)) {
27112 +                       return vaddr - kernel_start_pa;
27113 +               }
27114 +
27115 +               /* In kernel area -- virtually mapped.  */
27116 +               pgd = pgd_offset_k(vaddr);
27117 +               if (pgd_none(*pgd) || pgd_bad(*pgd))
27118 +                       return ~0UL;
27119 +
27120 +               pud = pud_offset(pgd, vaddr);
27121 +               if (pud_none(*pud) || pud_bad(*pud))
27122 +                       return ~0UL;
27123 +
27124 +               pmd = pmd_offset(pud, vaddr);
27125 +               if (pmd_none(*pmd) || pmd_bad(*pmd))
27126 +                       return ~0UL;
27127 +
27128 +               ptep = pte_offset_kernel(pmd, vaddr);
27129 +               if (!ptep)
27130 +                       return ~0UL;
27131 +
27132 +               return (pte_val(*ptep) & _PFN_MASK) | (vaddr & ~PAGE_MASK);
27133 +       }
27134 +#endif
27135 +
27136 +       if (vaddr > TASK_SIZE) {
27137 +               /* kernel address */
27138 +               return __pa(vaddr);
27139 +       }
27140 +
27141 +
27142 +#ifdef CONFIG_VMX_GUEST
27143 +       /* No privcmd within vmx guest.  */
27144 +       return ~0UL;
27145 +#else
27146 +       /* XXX double-check (lack of) locking */
27147 +       vma = find_extend_vma(current->mm, vaddr);
27148 +       if (!vma)
27149 +               return ~0UL;
27150 +
27151 +       /* We assume the page is modified.  */
27152 +       page = follow_page(vma, vaddr, FOLL_WRITE | FOLL_TOUCH);
27153 +       if (!page)
27154 +               return ~0UL;
27155 +
27156 +       return (page_to_pfn(page) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
27157 +#endif
27158 +}
27159 +
27160 +static int
27161 +xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes)
27162 +{
27163 +       unsigned long recorded = 0;
27164 +       int i = 0;
27165 +
27166 +       BUG_ON((buffer == NULL) && (bytes > 0));
27167 +
27168 +       /* record the physical pages used */
27169 +       if (buffer == NULL)
27170 +               desc->nr_addrs = 0;
27171 +
27172 +       while ((recorded < bytes) && (i < desc->nr_addrs)) {
27173 +               unsigned long vaddr = (unsigned long)buffer + recorded;
27174 +               unsigned long paddr;
27175 +               int offset;
27176 +               int chunksz;
27177 +
27178 +               offset = vaddr % PAGE_SIZE; /* handle partial pages */
27179 +               chunksz = min(PAGE_SIZE - offset, bytes - recorded);
27180 +
27181 +               paddr = xencomm_vaddr_to_paddr(vaddr);
27182 +               if (paddr == ~0UL) {
27183 +                       printk("%s: couldn't translate vaddr %lx\n",
27184 +                              __func__, vaddr);
27185 +                       return -EINVAL;
27186 +               }
27187 +
27188 +               desc->address[i++] = paddr;
27189 +               recorded += chunksz;
27190 +       }
27191 +
27192 +       if (recorded < bytes) {
27193 +               printk("%s: could only translate %ld of %ld bytes\n",
27194 +                      __func__, recorded, bytes);
27195 +               return -ENOSPC;
27196 +       }
27197 +
27198 +       /* mark remaining addresses invalid (just for safety) */
27199 +       while (i < desc->nr_addrs)
27200 +               desc->address[i++] = XENCOMM_INVALID;
27201 +
27202 +       desc->magic = XENCOMM_MAGIC;
27203 +
27204 +       return 0;
27205 +}
27206 +
27207 +static struct xencomm_desc *
27208 +xencomm_alloc(gfp_t gfp_mask)
27209 +{
27210 +       struct xencomm_desc *desc;
27211 +
27212 +       desc = (struct xencomm_desc *)__get_free_page(gfp_mask);
27213 +       if (desc == NULL)
27214 +               panic("%s: page allocation failed\n", __func__);
27215 +
27216 +       desc->nr_addrs = (PAGE_SIZE - sizeof(struct xencomm_desc)) /
27217 +                        sizeof(*desc->address);
27218 +
27219 +       return desc;
27220 +}
27221 +
27222 +void
27223 +xencomm_free(struct xencomm_handle *desc)
27224 +{
27225 +       if (desc)
27226 +               free_page((unsigned long)__va(desc));
27227 +}
27228 +
27229 +int
27230 +xencomm_create(void *buffer, unsigned long bytes,
27231 +               struct xencomm_handle **ret, gfp_t gfp_mask)
27232 +{
27233 +       struct xencomm_desc *desc;
27234 +       struct xencomm_handle *handle;
27235 +       int rc;
27236 +
27237 +       if (xencomm_debug)
27238 +               printk("%s: %p[%ld]\n", __func__, buffer, bytes);
27239 +
27240 +       if (buffer == NULL || bytes == 0) {
27241 +               *ret = (struct xencomm_handle *)NULL;
27242 +               return 0;
27243 +       }
27244 +
27245 +       desc = xencomm_alloc(gfp_mask);
27246 +       if (!desc) {
27247 +               printk("%s failure\n", "xencomm_alloc");
27248 +               return -ENOMEM;
27249 +       }
27250 +       handle = (struct xencomm_handle *)__pa(desc);
27251 +
27252 +       rc = xencomm_init_desc(desc, buffer, bytes);
27253 +       if (rc) {
27254 +               printk("%s failure: %d\n", "xencomm_init_desc", rc);
27255 +               xencomm_free(handle);
27256 +               return rc;
27257 +       }
27258 +
27259 +       *ret = handle;
27260 +       return 0;
27261 +}
27262 +
27263 +/* "mini" routines, for stack-based communications: */
27264 +
27265 +static void *
27266 +xencomm_alloc_mini(struct xencomm_mini *area, int *nbr_area)
27267 +{
27268 +       unsigned long base;
27269 +       unsigned int pageoffset;
27270 +
27271 +       while (*nbr_area >= 0) {
27272 +               /* Allocate an area.  */
27273 +               (*nbr_area)--;
27274 +
27275 +               base = (unsigned long)(area + *nbr_area);
27276 +               pageoffset = base % PAGE_SIZE;
27277 +
27278 +               /* If the area does not cross a page, use it.  */
27279 +               if ((PAGE_SIZE - pageoffset) >= sizeof(struct xencomm_mini))
27280 +                       return &area[*nbr_area];
27281 +       }
27282 +       /* No more area.  */
27283 +       return NULL;
27284 +}
27285 +
27286 +int
27287 +xencomm_create_mini(struct xencomm_mini *area, int *nbr_area,
27288 +                    void *buffer, unsigned long bytes,
27289 +                    struct xencomm_handle **ret)
27290 +{
27291 +       struct xencomm_desc *desc;
27292 +       int rc;
27293 +       unsigned long res;
27294 +
27295 +       desc = xencomm_alloc_mini(area, nbr_area);
27296 +       if (!desc)
27297 +               return -ENOMEM;
27298 +       desc->nr_addrs = XENCOMM_MINI_ADDRS;
27299 +
27300 +       rc = xencomm_init_desc(desc, buffer, bytes);
27301 +       if (rc)
27302 +               return rc;
27303 +
27304 +       res = xencomm_vaddr_to_paddr((unsigned long)desc);
27305 +       if (res == ~0UL)
27306 +               return -EINVAL;
27307 +
27308 +       *ret = (struct xencomm_handle*)res;
27309 +       return 0;
27310 +}
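
The descriptor built by xencomm_create()/xencomm_init_desc() above is just a page of physical addresses, one per page-bounded chunk of the source buffer, preceded by a magic value and a slot count; the first chunk may be partial when the buffer does not start on a page boundary. The stand-alone user-space model below walks through that chunking; the 16 KB page size and the identity vaddr-to-paddr translation are assumed only for the demonstration, and nothing in it is part of the patch.

/* Illustrative stand-alone model -- not applied by this patch. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE 16384UL		/* assumed ia64-style page size */

struct demo_desc {
	uint32_t magic;			/* would be XENCOMM_MAGIC in the kernel */
	uint32_t nr_addrs;		/* slots available in address[] */
	uint64_t address[8];		/* a real descriptor fills the rest of a page */
};

/* Identity translation, standing in for xencomm_vaddr_to_paddr(). */
static uint64_t demo_vaddr_to_paddr(uint64_t vaddr)
{
	return vaddr;
}

static int demo_init_desc(struct demo_desc *desc, uint64_t buffer,
			  unsigned long bytes)
{
	unsigned long recorded = 0;
	unsigned int i = 0;

	while (recorded < bytes && i < desc->nr_addrs) {
		uint64_t vaddr = buffer + recorded;
		unsigned long offset = vaddr % DEMO_PAGE_SIZE;	/* partial first page */
		unsigned long chunksz = DEMO_PAGE_SIZE - offset;

		if (chunksz > bytes - recorded)
			chunksz = bytes - recorded;

		desc->address[i] = demo_vaddr_to_paddr(vaddr);
		printf("chunk %u: paddr 0x%llx, %lu bytes\n", i,
		       (unsigned long long)desc->address[i], chunksz);
		i++;
		recorded += chunksz;
	}
	return recorded < bytes ? -1 : 0;
}

int main(void)
{
	struct demo_desc desc = { .nr_addrs = 8 };

	/* A 40000-byte buffer starting 100 bytes into a page needs three
	 * physical chunks on a 16 KB-page machine. */
	return demo_init_desc(&desc, 7 * DEMO_PAGE_SIZE + 100, 40000);
}

The xencomm_mini variant uses the same chunking but writes into a small, stack-resident descriptor with XENCOMM_MINI_ADDRS slots; that is why xencomm_alloc_mini() above rejects any candidate area that would straddle a page boundary, since the descriptor itself must stay physically contiguous.
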
27311 diff -ruNp linux-2.6.19/arch/ia64/xen/xenentry.S linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenentry.S
27312 --- linux-2.6.19/arch/ia64/xen/xenentry.S       1970-01-01 00:00:00.000000000 +0000
27313 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenentry.S     2007-02-02 19:10:21.000000000 +0000
27314 @@ -0,0 +1,908 @@
27315 +/*
27316 + * ia64/xen/entry.S
27317 + *
27318 + * Alternate kernel routines for Xen.  Heavily leveraged from
27319 + *   ia64/kernel/entry.S
27320 + *
27321 + * Copyright (C) 2005 Hewlett-Packard Co
27322 + *     Dan Magenheimer <dan.magenheimer@hp.com>
27323 + */
27324 +
27325 +#include <asm/asmmacro.h>
27326 +#include <asm/cache.h>
27327 +#include <asm/errno.h>
27328 +#include <asm/kregs.h>
27329 +#include <asm/asm-offsets.h>
27330 +#include <asm/pgtable.h>
27331 +#include <asm/percpu.h>
27332 +#include <asm/processor.h>
27333 +#include <asm/thread_info.h>
27334 +#include <asm/unistd.h>
27335 +
27336 +#ifdef CONFIG_XEN
27337 +#include "xenminstate.h"
27338 +#else
27339 +#include "minstate.h"
27340 +#endif
27341 +
27342 +/*
27343 + * prev_task <- ia64_switch_to(struct task_struct *next)
27344 + *     With Ingo's new scheduler, interrupts are disabled when this routine gets
27345 + *     called.  The code starting at .map relies on this.  The rest of the code
27346 + *     doesn't care about the interrupt masking status.
27347 + */
27348 +#ifdef CONFIG_XEN
27349 +GLOBAL_ENTRY(xen_switch_to)
27350 +       .prologue
27351 +       alloc r16=ar.pfs,1,0,0,0
27352 +       movl r22=running_on_xen;;
27353 +       ld4 r22=[r22];;
27354 +       cmp.eq p7,p0=r22,r0
27355 +(p7)   br.cond.sptk.many __ia64_switch_to;;
27356 +#else
27357 +GLOBAL_ENTRY(ia64_switch_to)
27358 +       .prologue
27359 +       alloc r16=ar.pfs,1,0,0,0
27360 +#endif
27361 +       DO_SAVE_SWITCH_STACK
27362 +       .body
27363 +
27364 +       adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
27365 +       movl r25=init_task
27366 +       mov r27=IA64_KR(CURRENT_STACK)
27367 +       adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
27368 +       dep r20=0,in0,61,3              // physical address of "next"
27369 +       ;;
27370 +       st8 [r22]=sp                    // save kernel stack pointer of old task
27371 +       shr.u r26=r20,IA64_GRANULE_SHIFT
27372 +       cmp.eq p7,p6=r25,in0
27373 +       ;;
27374 +#ifdef CONFIG_XEN
27375 +       movl r8=XSI_PSR_IC
27376 +       ;;
27377 +       st4 [r8]=r0     // force psr.ic off for hyperprivop(s)
27378 +       ;;
27379 +#endif
27380 +       /*
27381 +        * If we've already mapped this task's page, we can skip doing it again.
27382 +        */
27383 +(p6)   cmp.eq p7,p6=r26,r27
27384 +(p6)   br.cond.dpnt .map
27385 +       ;;
27386 +.done:
27387 +#ifdef CONFIG_XEN
27388 +       // psr.ic already off
27389 +       // update "current" application register
27390 +       mov r8=IA64_KR_CURRENT
27391 +       mov r9=in0;;
27392 +       XEN_HYPER_SET_KR
27393 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
27394 +       movl r27=XSI_PSR_IC
27395 +       mov r8=1
27396 +       ;;
27397 +       st4 [r27]=r8                    // psr.ic back on
27398 +#else
27399 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
27400 +       mov IA64_KR(CURRENT)=in0        // update "current" application register
27401 +#endif
27402 +       mov r8=r13                      // return pointer to previously running task
27403 +       mov r13=in0                     // set "current" pointer
27404 +       ;;
27405 +       DO_LOAD_SWITCH_STACK
27406 +
27407 +#ifdef CONFIG_SMP
27408 +       sync.i                          // ensure "fc"s done by this CPU are visible on other CPUs
27409 +#endif
27410 +       br.ret.sptk.many rp             // boogie on out in new context
27411 +
27412 +.map:
27413 +#ifdef CONFIG_XEN
27414 +       // psr.ic already off
27415 +#else
27416 +       rsm psr.ic                      // interrupts (psr.i) are already disabled here
27417 +#endif
27418 +       movl r25=PAGE_KERNEL
27419 +       ;;
27420 +       srlz.d
27421 +       or r23=r25,r20                  // construct PA | page properties
27422 +       mov r25=IA64_GRANULE_SHIFT<<2
27423 +       ;;
27424 +#ifdef CONFIG_XEN
27425 +       movl r8=XSI_ITIR
27426 +       ;;
27427 +       st8 [r8]=r25
27428 +       ;;
27429 +       movl r8=XSI_IFA
27430 +       ;;
27431 +       st8 [r8]=in0                     // VA of next task...
27432 +       ;;
27433 +       mov r25=IA64_TR_CURRENT_STACK
27434 +       // remember last page we mapped...
27435 +       mov r8=IA64_KR_CURRENT_STACK
27436 +       mov r9=r26;;
27437 +       XEN_HYPER_SET_KR;;
27438 +#else
27439 +       mov cr.itir=r25
27440 +       mov cr.ifa=in0                  // VA of next task...
27441 +       ;;
27442 +       mov r25=IA64_TR_CURRENT_STACK
27443 +       mov IA64_KR(CURRENT_STACK)=r26  // remember last page we mapped...
27444 +#endif
27445 +       ;;
27446 +       itr.d dtr[r25]=r23              // wire in new mapping...
27447 +#ifndef CONFIG_XEN
27448 +       ssm psr.ic                      // reenable the psr.ic bit
27449 +       ;;
27450 +       srlz.d
27451 +#endif
27452 +       br.cond.sptk .done
27453 +#ifdef CONFIG_XEN
27454 +END(xen_switch_to)
27455 +#else
27456 +END(ia64_switch_to)
27457 +#endif
27458 +
27459 +       /*
27460 +        * Invoke a system call, but do some tracing before and after the call.
27461 +        * We MUST preserve the current register frame throughout this routine
27462 +        * because some system calls (such as ia64_execve) directly
27463 +        * manipulate ar.pfs.
27464 +        */
27465 +#ifdef CONFIG_XEN
27466 +GLOBAL_ENTRY(xen_trace_syscall)
27467 +       PT_REGS_UNWIND_INFO(0)
27468 +       movl r16=running_on_xen;;
27469 +       ld4 r16=[r16];;
27470 +       cmp.eq p7,p0=r16,r0
27471 +(p7)   br.cond.sptk.many __ia64_trace_syscall;;
27472 +#else
27473 +GLOBAL_ENTRY(ia64_trace_syscall)
27474 +       PT_REGS_UNWIND_INFO(0)
27475 +#endif
27476 +       /*
27477 +        * We need to preserve the scratch registers f6-f11 in case the system
27478 +        * call is sigreturn.
27479 +        */
27480 +       adds r16=PT(F6)+16,sp
27481 +       adds r17=PT(F7)+16,sp
27482 +       ;;
27483 +       stf.spill [r16]=f6,32
27484 +       stf.spill [r17]=f7,32
27485 +       ;;
27486 +       stf.spill [r16]=f8,32
27487 +       stf.spill [r17]=f9,32
27488 +       ;;
27489 +       stf.spill [r16]=f10
27490 +       stf.spill [r17]=f11
27491 +       br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
27492 +       adds r16=PT(F6)+16,sp
27493 +       adds r17=PT(F7)+16,sp
27494 +       ;;
27495 +       ldf.fill f6=[r16],32
27496 +       ldf.fill f7=[r17],32
27497 +       ;;
27498 +       ldf.fill f8=[r16],32
27499 +       ldf.fill f9=[r17],32
27500 +       ;;
27501 +       ldf.fill f10=[r16]
27502 +       ldf.fill f11=[r17]
27503 +       // the syscall number may have changed, so re-load it and re-calculate the
27504 +       // syscall entry-point:
27505 +       adds r15=PT(R15)+16,sp                  // r15 = &pt_regs.r15 (syscall #)
27506 +       ;;
27507 +       ld8 r15=[r15]
27508 +       mov r3=NR_syscalls - 1
27509 +       ;;
27510 +       adds r15=-1024,r15
27511 +       movl r16=sys_call_table
27512 +       ;;
27513 +       shladd r20=r15,3,r16                    // r20 = sys_call_table + 8*(syscall-1024)
27514 +       cmp.leu p6,p7=r15,r3
27515 +       ;;
27516 +(p6)   ld8 r20=[r20]                           // load address of syscall entry point
27517 +(p7)   movl r20=sys_ni_syscall
27518 +       ;;
27519 +       mov b6=r20
27520 +       br.call.sptk.many rp=b6                 // do the syscall
27521 +.strace_check_retval:
27522 +       cmp.lt p6,p0=r8,r0                      // syscall failed?
27523 +       adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
27524 +       adds r3=PT(R10)+16,sp                   // r3 = &pt_regs.r10
27525 +       mov r10=0
27526 +(p6)   br.cond.sptk strace_error               // syscall failed ->
27527 +       ;;                                      // avoid RAW on r10
27528 +.strace_save_retval:
27529 +.mem.offset 0,0; st8.spill [r2]=r8             // store return value in slot for r8
27530 +.mem.offset 8,0; st8.spill [r3]=r10            // clear error indication in slot for r10
27531 +       br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
27532 +.ret3:
27533 +(pUStk)        cmp.eq.unc p6,p0=r0,r0                  // p6 <- pUStk
27534 +       br.cond.sptk .work_pending_syscall_end
27535 +
27536 +strace_error:
27537 +       ld8 r3=[r2]                             // load pt_regs.r8
27538 +       sub r9=0,r8                             // negate return value to get errno value
27539 +       ;;
27540 +       cmp.ne p6,p0=r3,r0                      // is pt_regs.r8!=0?
27541 +       adds r3=16,r2                           // r3=&pt_regs.r10
27542 +       ;;
27543 +(p6)   mov r10=-1
27544 +(p6)   mov r8=r9
27545 +       br.cond.sptk .strace_save_retval
27546 +#ifdef CONFIG_XEN
27547 +END(xen_trace_syscall)
27548 +#else
27549 +END(ia64_trace_syscall)
27550 +#endif
27551 +
27552 +#ifdef CONFIG_XEN
27553 +GLOBAL_ENTRY(xen_ret_from_clone)
27554 +       PT_REGS_UNWIND_INFO(0)
27555 +       movl r16=running_on_xen;;
27556 +       ld4 r16=[r16];;
27557 +       cmp.eq p7,p0=r16,r0
27558 +(p7)   br.cond.sptk.many __ia64_ret_from_clone;;
27559 +#else  
27560 +GLOBAL_ENTRY(ia64_ret_from_clone)
27561 +       PT_REGS_UNWIND_INFO(0)
27562 +#endif 
27563 +{      /*
27564 +        * Some versions of gas generate bad unwind info if the first instruction of a
27565 +        * procedure doesn't go into the first slot of a bundle.  This is a workaround.
27566 +        */
27567 +       nop.m 0
27568 +       nop.i 0
27569 +       /*
27570 +        * We need to call schedule_tail() to complete the scheduling process.
27571 +        * Called by ia64_switch_to() after do_fork()->copy_thread().  r8 contains the
27572 +        * address of the previously executing task.
27573 +        */
27574 +       br.call.sptk.many rp=ia64_invoke_schedule_tail
27575 +}
27576 +.ret8:
27577 +       adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
27578 +       ;;
27579 +       ld4 r2=[r2]
27580 +       ;;
27581 +       mov r8=0
27582 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2
27583 +       ;;
27584 +       cmp.ne p6,p0=r2,r0
27585 +(p6)   br.cond.spnt .strace_check_retval
27586 +       ;;                                      // added stop bits to prevent r8 dependency
27587 +#ifdef CONFIG_XEN
27588 +       br.cond.sptk ia64_ret_from_syscall
27589 +END(xen_ret_from_clone)
27590 +#else
27591 +END(ia64_ret_from_clone)
27592 +#endif         
27593 +/*
27594 + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
27595 + *     need to switch to bank 0 and doesn't restore the scratch registers.
27596 + *     To avoid leaking kernel bits, the scratch registers are set to
27597 + *     the following known-to-be-safe values:
27598 + *
27599 + *               r1: restored (global pointer)
27600 + *               r2: cleared
27601 + *               r3: 1 (when returning to user-level)
27602 + *           r8-r11: restored (syscall return value(s))
27603 + *              r12: restored (user-level stack pointer)
27604 + *              r13: restored (user-level thread pointer)
27605 + *              r14: set to __kernel_syscall_via_epc
27606 + *              r15: restored (syscall #)
27607 + *          r16-r17: cleared
27608 + *              r18: user-level b6
27609 + *              r19: cleared
27610 + *              r20: user-level ar.fpsr
27611 + *              r21: user-level b0
27612 + *              r22: cleared
27613 + *              r23: user-level ar.bspstore
27614 + *              r24: user-level ar.rnat
27615 + *              r25: user-level ar.unat
27616 + *              r26: user-level ar.pfs
27617 + *              r27: user-level ar.rsc
27618 + *              r28: user-level ip
27619 + *              r29: user-level psr
27620 + *              r30: user-level cfm
27621 + *              r31: user-level pr
27622 + *           f6-f11: cleared
27623 + *               pr: restored (user-level pr)
27624 + *               b0: restored (user-level rp)
27625 + *               b6: restored
27626 + *               b7: set to __kernel_syscall_via_epc
27627 + *          ar.unat: restored (user-level ar.unat)
27628 + *           ar.pfs: restored (user-level ar.pfs)
27629 + *           ar.rsc: restored (user-level ar.rsc)
27630 + *          ar.rnat: restored (user-level ar.rnat)
27631 + *      ar.bspstore: restored (user-level ar.bspstore)
27632 + *          ar.fpsr: restored (user-level ar.fpsr)
27633 + *           ar.ccv: cleared
27634 + *           ar.csd: cleared
27635 + *           ar.ssd: cleared
27636 + */
27637 +#ifdef CONFIG_XEN
27638 +GLOBAL_ENTRY(xen_leave_syscall)
27639 +       PT_REGS_UNWIND_INFO(0)
27640 +       movl r22=running_on_xen;;
27641 +       ld4 r22=[r22];;
27642 +       cmp.eq p7,p0=r22,r0
27643 +(p7)   br.cond.sptk.many __ia64_leave_syscall;;
27644 +#else
27645 +ENTRY(ia64_leave_syscall)
27646 +       PT_REGS_UNWIND_INFO(0)
27647 +#endif
27648 +       /*
27649 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
27650 +        * user- or fsys-mode, hence we disable interrupts early on.
27651 +        *
27652 +        * p6 controls whether current_thread_info()->flags needs to be checked for
27653 +        * extra work.  We always check for extra work when returning to user-level.
27654 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
27655 +        * is 0.  After extra work processing has been completed, execution
27656 +        * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
27657 +        * needs to be redone.
27658 +        */
27659 +#ifdef CONFIG_PREEMPT
27660 +       rsm psr.i                               // disable interrupts
27661 +       cmp.eq pLvSys,p0=r0,r0                  // pLvSys=1: leave from syscall
27662 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27663 +       ;;
27664 +       .pred.rel.mutex pUStk,pKStk
27665 +(pKStk) ld4 r21=[r20]                  // r21 <- preempt_count
27666 +(pUStk)        mov r21=0                       // r21 <- 0
27667 +       ;;
27668 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
27669 +#else /* !CONFIG_PREEMPT */
27670 +#ifdef CONFIG_XEN
27671 +       movl r2=XSI_PSR_I_ADDR
27672 +       mov r18=1
27673 +       ;;
27674 +       ld8 r2=[r2]
27675 +       ;;
27676 +(pUStk)        st1 [r2]=r18
27677 +#else
27678 +(pUStk)        rsm psr.i
27679 +#endif
27680 +       cmp.eq pLvSys,p0=r0,r0          // pLvSys=1: leave from syscall
27681 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
27682 +#endif
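Under CONFIG_XEN the prologue above replaces "rsm psr.i" with a store through XSI_PSR_I_ADDR. A small C sketch of that idea, with illustrative names (the real address and layout come from the Xen shared area, not from this sketch):

#include <stdint.h>

static uint8_t vcpu_psr_i_masked;                       /* stand-in for the per-vCPU byte */
static uint8_t *xsi_psr_i_addr = &vcpu_psr_i_masked;    /* ld8 r2=[XSI_PSR_I_ADDR]        */

static void xen_mask_virtual_interrupts(void)
{
        *xsi_psr_i_addr = 1;                            /* st1 [r2]=r18 with r18 == 1     */
}

int main(void)
{
        xen_mask_virtual_interrupts();                  /* virtual interrupts now masked  */
        return (int)vcpu_psr_i_masked;
}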
27683 +.work_processed_syscall:
27684 +       adds r2=PT(LOADRS)+16,r12
27685 +       adds r3=PT(AR_BSPSTORE)+16,r12
27686 +       adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
27687 +       ;;
27688 +(p6)   ld4 r31=[r18]                           // load current_thread_info()->flags
27689 +       ld8 r19=[r2],PT(B6)-PT(LOADRS)          // load ar.rsc value for "loadrs"
27690 +       nop.i 0
27691 +       ;;
27692 +       mov r16=ar.bsp                          // M2  get existing backing store pointer
27693 +       ld8 r18=[r2],PT(R9)-PT(B6)              // load b6
27694 +(p6)   and r15=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
27695 +       ;;
27696 +       ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)    // load ar.bspstore (may be garbage)
27697 +(p6)   cmp4.ne.unc p6,p0=r15, r0               // any special work pending?
27698 +(p6)   br.cond.spnt .work_pending_syscall
27699 +       ;;
27700 +       // start restoring the state saved on the kernel stack (struct pt_regs):
27701 +       ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
27702 +       ld8 r11=[r3],PT(CR_IIP)-PT(R11)
27703 +(pNonSys) break 0              //      bug check: we shouldn't be here if pNonSys is TRUE!
27704 +       ;;
27705 +       invala                  // M0|1 invalidate ALAT
27706 +#ifdef CONFIG_XEN
27707 +       movl r28=XSI_PSR_I_ADDR
27708 +       movl r29=XSI_PSR_IC
27709 +       ;;
27710 +       ld8 r28=[r28]
27711 +       mov r30=1
27712 +       ;;
27713 +       st1     [r28]=r30
27714 +       st4     [r29]=r0        // note: clears both vpsr.i and vpsr.ic!
27715 +       ;;
27716 +#else
27717 +       rsm psr.i | psr.ic      // M2   turn off interrupts and interruption collection
27718 +#endif
27719 +       cmp.eq p9,p0=r0,r0      // A    set p9 to indicate that we should restore cr.ifs
27720 +
27721 +       ld8 r29=[r2],16         // M0|1 load cr.ipsr
27722 +       ld8 r28=[r3],16         // M0|1 load cr.iip
27723 +       mov r22=r0              // A    clear r22
27724 +       ;;
27725 +       ld8 r30=[r2],16         // M0|1 load cr.ifs
27726 +       ld8 r25=[r3],16         // M0|1 load ar.unat
27727 +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
27728 +       ;;
27729 +       ld8 r26=[r2],PT(B0)-PT(AR_PFS)  // M0|1 load ar.pfs
27730 +(pKStk)        mov r22=psr                     // M2   read PSR now that interrupts are disabled
27731 +       nop 0
27732 +       ;;
27733 +       ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
27734 +       ld8 r27=[r3],PT(PR)-PT(AR_RSC)  // M0|1 load ar.rsc
27735 +       mov f6=f0                       // F    clear f6
27736 +       ;;
27737 +       ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)    // M0|1 load ar.rnat (may be garbage)
27738 +       ld8 r31=[r3],PT(R1)-PT(PR)              // M0|1 load predicates
27739 +       mov f7=f0                               // F    clear f7
27740 +       ;;
27741 +       ld8 r20=[r2],PT(R12)-PT(AR_FPSR)        // M0|1 load ar.fpsr
27742 +       ld8.fill r1=[r3],16                     // M0|1 load r1
27743 +(pUStk) mov r17=1                              // A
27744 +       ;;
27745 +(pUStk) st1 [r14]=r17                          // M2|3
27746 +       ld8.fill r13=[r3],16                    // M0|1
27747 +       mov f8=f0                               // F    clear f8
27748 +       ;;
27749 +       ld8.fill r12=[r2]                       // M0|1 restore r12 (sp)
27750 +       ld8.fill r15=[r3]                       // M0|1 restore r15
27751 +       mov b6=r18                              // I0   restore b6
27752 +
27753 +       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A
27754 +       mov f9=f0                                       // F    clear f9
27755 +(pKStk) br.cond.dpnt.many skip_rbs_switch              // B
27756 +
27757 +       srlz.d                          // M0   ensure interruption collection is off (for cover)
27758 +       shr.u r18=r19,16                // I0|1 get byte size of existing "dirty" partition
27759 +#ifdef CONFIG_XEN
27760 +       XEN_HYPER_COVER;
27761 +#else
27762 +       cover                           // B    add current frame into dirty partition & set cr.ifs
27763 +#endif
27764 +       ;;
27765 +(pUStk) ld4 r17=[r17]                  // M0|1 r17 = cpu_data->phys_stacked_size_p8
27766 +       mov r19=ar.bsp                  // M2   get new backing store pointer
27767 +       mov f10=f0                      // F    clear f10
27768 +
27769 +       nop.m 0
27770 +       movl r14=__kernel_syscall_via_epc // X
27771 +       ;;
27772 +       mov.m ar.csd=r0                 // M2   clear ar.csd
27773 +       mov.m ar.ccv=r0                 // M2   clear ar.ccv
27774 +       mov b7=r14                      // I0   clear b7 (hint with __kernel_syscall_via_epc)
27775 +
27776 +       mov.m ar.ssd=r0                 // M2   clear ar.ssd
27777 +       mov f11=f0                      // F    clear f11
27778 +       br.cond.sptk.many rbs_switch    // B
27779 +#ifdef CONFIG_XEN
27780 +END(xen_leave_syscall)
27781 +#else
27782 +END(ia64_leave_syscall)
27783 +#endif
27784 +
27785 +#ifdef CONFIG_XEN
27786 +GLOBAL_ENTRY(xen_leave_kernel)
27787 +       PT_REGS_UNWIND_INFO(0)
27788 +       movl r22=running_on_xen;;
27789 +       ld4 r22=[r22];;
27790 +       cmp.eq p7,p0=r22,r0
27791 +(p7)   br.cond.sptk.many __ia64_leave_kernel;;
27792 +#else
27793 +GLOBAL_ENTRY(ia64_leave_kernel)
27794 +       PT_REGS_UNWIND_INFO(0)
27795 +#endif
27796 +       /*
27797 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
27798 +        * user- or fsys-mode, hence we disable interrupts early on.
27799 +        *
27800 +        * p6 controls whether current_thread_info()->flags needs to be checked for
27801 +        * extra work.  We always check for extra work when returning to user-level.
27802 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
27803 +        * is 0.  After extra work processing has been completed, execution
27804 +        * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
27805 +        * needs to be redone.
27806 +        */
27807 +#ifdef CONFIG_PREEMPT
27808 +       rsm psr.i                               // disable interrupts
27809 +       cmp.eq p0,pLvSys=r0,r0                  // pLvSys=0: leave from kernel
27810 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27811 +       ;;
27812 +       .pred.rel.mutex pUStk,pKStk
27813 +(pKStk)        ld4 r21=[r20]                   // r21 <- preempt_count
27814 +(pUStk)        mov r21=0                       // r21 <- 0
27815 +       ;;
27816 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
27817 +#else
27818 +#ifdef CONFIG_XEN
27819 +(pUStk)        movl r17=XSI_PSR_I_ADDR
27820 +(pUStk)        mov r31=1
27821 +               ;;
27822 +(pUStk)        ld8 r17=[r17]
27823 +               ;;
27824 +(pUStk)        st1 [r17]=r31
27825 +       ;;
27826 +#else
27827 +(pUStk)        rsm psr.i
27828 +#endif
27829 +       cmp.eq p0,pLvSys=r0,r0          // pLvSys=0: leave from kernel
27830 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
27831 +#endif
27832 +.work_processed_kernel:
27833 +       adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
27834 +       ;;
27835 +(p6)   ld4 r31=[r17]                           // load current_thread_info()->flags
27836 +       adds r21=PT(PR)+16,r12
27837 +       ;;
27838 +
27839 +       lfetch [r21],PT(CR_IPSR)-PT(PR)
27840 +       adds r2=PT(B6)+16,r12
27841 +       adds r3=PT(R16)+16,r12
27842 +       ;;
27843 +       lfetch [r21]
27844 +       ld8 r28=[r2],8          // load b6
27845 +       adds r29=PT(R24)+16,r12
27846 +
27847 +       ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
27848 +       adds r30=PT(AR_CCV)+16,r12
27849 +(p6)   and r19=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
27850 +       ;;
27851 +       ld8.fill r24=[r29]
27852 +       ld8 r15=[r30]           // load ar.ccv
27853 +(p6)   cmp4.ne.unc p6,p0=r19, r0               // any special work pending?
27854 +       ;;
27855 +       ld8 r29=[r2],16         // load b7
27856 +       ld8 r30=[r3],16         // load ar.csd
27857 +(p6)   br.cond.spnt .work_pending
27858 +       ;;
27859 +       ld8 r31=[r2],16         // load ar.ssd
27860 +       ld8.fill r8=[r3],16
27861 +       ;;
27862 +       ld8.fill r9=[r2],16
27863 +       ld8.fill r10=[r3],PT(R17)-PT(R10)
27864 +       ;;
27865 +       ld8.fill r11=[r2],PT(R18)-PT(R11)
27866 +       ld8.fill r17=[r3],16
27867 +       ;;
27868 +       ld8.fill r18=[r2],16
27869 +       ld8.fill r19=[r3],16
27870 +       ;;
27871 +       ld8.fill r20=[r2],16
27872 +       ld8.fill r21=[r3],16
27873 +       mov ar.csd=r30
27874 +       mov ar.ssd=r31
27875 +       ;;
27876 +#ifdef CONFIG_XEN
27877 +       movl r23=XSI_PSR_I_ADDR
27878 +       movl r22=XSI_PSR_IC
27879 +       ;;
27880 +       ld8 r23=[r23]
27881 +       mov r25=1
27882 +       ;;
27883 +       st1 [r23]=r25
27884 +       st4 [r22]=r0            // note: clears both vpsr.i and vpsr.ic!
27885 +       ;;
27886 +#else
27887 +       rsm psr.i | psr.ic      // initiate turning off of interrupt and interruption collection
27888 +#endif
27889 +       invala                  // invalidate ALAT
27890 +       ;;
27891 +       ld8.fill r22=[r2],24
27892 +       ld8.fill r23=[r3],24
27893 +       mov b6=r28
27894 +       ;;
27895 +       ld8.fill r25=[r2],16
27896 +       ld8.fill r26=[r3],16
27897 +       mov b7=r29
27898 +       ;;
27899 +       ld8.fill r27=[r2],16
27900 +       ld8.fill r28=[r3],16
27901 +       ;;
27902 +       ld8.fill r29=[r2],16
27903 +       ld8.fill r30=[r3],24
27904 +       ;;
27905 +       ld8.fill r31=[r2],PT(F9)-PT(R31)
27906 +       adds r3=PT(F10)-PT(F6),r3
27907 +       ;;
27908 +       ldf.fill f9=[r2],PT(F6)-PT(F9)
27909 +       ldf.fill f10=[r3],PT(F8)-PT(F10)
27910 +       ;;
27911 +       ldf.fill f6=[r2],PT(F7)-PT(F6)
27912 +       ;;
27913 +       ldf.fill f7=[r2],PT(F11)-PT(F7)
27914 +       ldf.fill f8=[r3],32
27915 +       ;;
27916 +       srlz.d  // ensure that inter. collection is off (VHPT is don't care, since text is pinned)
27917 +       mov ar.ccv=r15
27918 +       ;;
27919 +       ldf.fill f11=[r2]
27920 +#ifdef CONFIG_XEN
27921 +       ;;
27922 +       // r16-r31 all now hold bank1 values
27923 +       movl r2=XSI_BANK1_R16
27924 +       movl r3=XSI_BANK1_R16+8
27925 +       ;;
27926 +.mem.offset 0,0; st8.spill [r2]=r16,16
27927 +.mem.offset 8,0; st8.spill [r3]=r17,16
27928 +       ;;
27929 +.mem.offset 0,0; st8.spill [r2]=r18,16
27930 +.mem.offset 8,0; st8.spill [r3]=r19,16
27931 +       ;;
27932 +.mem.offset 0,0; st8.spill [r2]=r20,16
27933 +.mem.offset 8,0; st8.spill [r3]=r21,16
27934 +       ;;
27935 +.mem.offset 0,0; st8.spill [r2]=r22,16
27936 +.mem.offset 8,0; st8.spill [r3]=r23,16
27937 +       ;;
27938 +.mem.offset 0,0; st8.spill [r2]=r24,16
27939 +.mem.offset 8,0; st8.spill [r3]=r25,16
27940 +       ;;
27941 +.mem.offset 0,0; st8.spill [r2]=r26,16
27942 +.mem.offset 8,0; st8.spill [r3]=r27,16
27943 +       ;;
27944 +.mem.offset 0,0; st8.spill [r2]=r28,16
27945 +.mem.offset 8,0; st8.spill [r3]=r29,16
27946 +       ;;
27947 +.mem.offset 0,0; st8.spill [r2]=r30,16
27948 +.mem.offset 8,0; st8.spill [r3]=r31,16
27949 +       ;;
27950 +       movl r2=XSI_BANKNUM;;
27951 +       st4 [r2]=r0;
27952 +#else
27953 +       bsw.0                   // switch back to bank 0 (no stop bit required beforehand...)
27954 +#endif
27955 +       ;;
27956 +(pUStk)        mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
27957 +       adds r16=PT(CR_IPSR)+16,r12
27958 +       adds r17=PT(CR_IIP)+16,r12
27959 +
27960 +(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
27961 +       nop.i 0
27962 +       nop.i 0
27963 +       ;;
27964 +       ld8 r29=[r16],16        // load cr.ipsr
27965 +       ld8 r28=[r17],16        // load cr.iip
27966 +       ;;
27967 +       ld8 r30=[r16],16        // load cr.ifs
27968 +       ld8 r25=[r17],16        // load ar.unat
27969 +       ;;
27970 +       ld8 r26=[r16],16        // load ar.pfs
27971 +       ld8 r27=[r17],16        // load ar.rsc
27972 +       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
27973 +       ;;
27974 +       ld8 r24=[r16],16        // load ar.rnat (may be garbage)
27975 +       ld8 r23=[r17],16        // load ar.bspstore (may be garbage)
27976 +       ;;
27977 +       ld8 r31=[r16],16        // load predicates
27978 +       ld8 r21=[r17],16        // load b0
27979 +       ;;
27980 +       ld8 r19=[r16],16        // load ar.rsc value for "loadrs"
27981 +       ld8.fill r1=[r17],16    // load r1
27982 +       ;;
27983 +       ld8.fill r12=[r16],16
27984 +       ld8.fill r13=[r17],16
27985 +(pUStk)        adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
27986 +       ;;
27987 +       ld8 r20=[r16],16        // ar.fpsr
27988 +       ld8.fill r15=[r17],16
27989 +       ;;
27990 +       ld8.fill r14=[r16],16
27991 +       ld8.fill r2=[r17]
27992 +(pUStk)        mov r17=1
27993 +       ;;
27994 +       ld8.fill r3=[r16]
27995 +(pUStk)        st1 [r18]=r17           // restore current->thread.on_ustack
27996 +       shr.u r18=r19,16        // get byte size of existing "dirty" partition
27997 +       ;;
27998 +       mov r16=ar.bsp          // get existing backing store pointer
27999 +       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
28000 +       ;;
28001 +       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
28002 +(pKStk)        br.cond.dpnt skip_rbs_switch
28003 +
28004 +       /*
28005 +        * Restore user backing store.
28006 +        *
28007 +        * NOTE: alloc, loadrs, and cover can't be predicated.
28008 +        */
28009 +(pNonSys) br.cond.dpnt dont_preserve_current_frame
28010 +
28011 +#ifdef CONFIG_XEN
28012 +       XEN_HYPER_COVER;
28013 +#else
28014 +       cover                           // add current frame into dirty partition and set cr.ifs
28015 +#endif
28016 +       ;;
28017 +       mov r19=ar.bsp                  // get new backing store pointer
28018 +rbs_switch:
28019 +       sub r16=r16,r18                 // krbs = old bsp - size of dirty partition
28020 +       cmp.ne p9,p0=r0,r0              // clear p9 to skip restore of cr.ifs
28021 +       ;;
28022 +       sub r19=r19,r16                 // calculate total byte size of dirty partition
28023 +       add r18=64,r18                  // don't force in0-in7 into memory...
28024 +       ;;
28025 +       shl r19=r19,16                  // shift size of dirty partition into loadrs position
28026 +       ;;
28027 +dont_preserve_current_frame:
28028 +       /*
28029 +        * To prevent leaking bits between the kernel and user-space,
28030 +        * we must clear the stacked registers in the "invalid" partition here.
28031 +        * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
28032 +        * 5 registers/cycle on McKinley).
28033 +        */
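A C sketch of the sizing arithmetic set up just below, under the register naming used in the asm comments (dirty_size mirrors r18 and stacked_size_p8 mirrors r17, i.e. physStackedSize + 8); this is an illustration, not kernel code:

#include <stdint.h>

static uint64_t bytes_to_clear(uint64_t dirty_size, uint64_t stacked_size_p8)
{
        uint64_t rnat_slots = dirty_size >> 9;          /* shr.u loc1=r18,9: one RNaT slot per 64 regs */
        return (stacked_size_p8 - dirty_size)           /* sub r17=r17,r18                             */
               + (rnat_slots << 3);                     /* shladd in0=loc1,3,r17                       */
}

The clearing code then zeroes Nregs registers (10 on Itanium, 14 otherwise) per pass and recurses while Nregs*8 is still below this byte count.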
28034 +#      define pRecurse p6
28035 +#      define pReturn  p7
28036 +#ifdef CONFIG_ITANIUM
28037 +#      define Nregs    10
28038 +#else
28039 +#      define Nregs    14
28040 +#endif
28041 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
28042 +       shr.u loc1=r18,9                // RNaTslots <= floor(dirtySize / (64*8))
28043 +       sub r17=r17,r18                 // r17 = (physStackedSize + 8) - dirtySize
28044 +       ;;
28045 +       mov ar.rsc=r19                  // load ar.rsc to be used for "loadrs"
28046 +       shladd in0=loc1,3,r17
28047 +       mov in1=0
28048 +       ;;
28049 +       TEXT_ALIGN(32)
28050 +rse_clear_invalid:
28051 +#ifdef CONFIG_ITANIUM
28052 +       // cycle 0
28053 + { .mii
28054 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
28055 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
28056 +       add out0=-Nregs*8,in0
28057 +}{ .mfb
28058 +       add out1=1,in1                  // increment recursion count
28059 +       nop.f 0
28060 +       nop.b 0                         // can't do br.call here because of alloc (WAW on CFM)
28061 +       ;;
28062 +}{ .mfi        // cycle 1
28063 +       mov loc1=0
28064 +       nop.f 0
28065 +       mov loc2=0
28066 +}{ .mib
28067 +       mov loc3=0
28068 +       mov loc4=0
28069 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid
28070 +
28071 +}{ .mfi        // cycle 2
28072 +       mov loc5=0
28073 +       nop.f 0
28074 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
28075 +}{ .mib
28076 +       mov loc6=0
28077 +       mov loc7=0
28078 +(pReturn) br.ret.sptk.many b0
28079 +}
28080 +#else /* !CONFIG_ITANIUM */
28081 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
28082 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
28083 +       add out0=-Nregs*8,in0
28084 +       add out1=1,in1                  // increment recursion count
28085 +       mov loc1=0
28086 +       mov loc2=0
28087 +       ;;
28088 +       mov loc3=0
28089 +       mov loc4=0
28090 +       mov loc5=0
28091 +       mov loc6=0
28092 +       mov loc7=0
28093 +(pRecurse) br.call.dptk.few b0=rse_clear_invalid
28094 +       ;;
28095 +       mov loc8=0
28096 +       mov loc9=0
28097 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
28098 +       mov loc10=0
28099 +       mov loc11=0
28100 +(pReturn) br.ret.dptk.many b0
28101 +#endif /* !CONFIG_ITANIUM */
28102 +#      undef pRecurse
28103 +#      undef pReturn
28104 +       ;;
28105 +       alloc r17=ar.pfs,0,0,0,0        // drop current register frame
28106 +       ;;
28107 +       loadrs
28108 +       ;;
28109 +skip_rbs_switch:
28110 +       mov ar.unat=r25         // M2
28111 +(pKStk)        extr.u r22=r22,21,1     // I0 extract current value of psr.pp from r22
28112 +(pLvSys)mov r19=r0             // A  clear r19 for leave_syscall, no-op otherwise
28113 +       ;;
28114 +(pUStk)        mov ar.bspstore=r23     // M2
28115 +(pKStk)        dep r29=r22,r29,21,1    // I0 update ipsr.pp with psr.pp
28116 +(pLvSys)mov r16=r0             // A  clear r16 for leave_syscall, no-op otherwise
28117 +       ;;
28118 +#ifdef CONFIG_XEN
28119 +       movl r25=XSI_IPSR
28120 +       ;;
28121 +       st8[r25]=r29,XSI_IFS_OFS-XSI_IPSR_OFS
28122 +       ;;
28123 +#else
28124 +       mov cr.ipsr=r29         // M2
28125 +#endif
28126 +       mov ar.pfs=r26          // I0
28127 +(pLvSys)mov r17=r0             // A  clear r17 for leave_syscall, no-op otherwise
28128 +
28129 +#ifdef CONFIG_XEN
28130 +(p9)   st8 [r25]=r30
28131 +       ;;
28132 +       adds r25=XSI_IIP_OFS-XSI_IFS_OFS,r25
28133 +       ;;
28134 +#else
28135 +(p9)   mov cr.ifs=r30          // M2
28136 +#endif
28137 +       mov b0=r21              // I0
28138 +(pLvSys)mov r18=r0             // A  clear r18 for leave_syscall, no-op otherwise
28139 +
28140 +       mov ar.fpsr=r20         // M2
28141 +#ifdef CONFIG_XEN
28142 +       st8     [r25]=r28
28143 +#else
28144 +       mov cr.iip=r28          // M2
28145 +#endif
28146 +       nop 0
28147 +       ;;
28148 +(pUStk)        mov ar.rnat=r24         // M2 must happen with RSE in lazy mode
28149 +       nop 0
28150 +(pLvSys)mov r2=r0
28151 +
28152 +       mov ar.rsc=r27          // M2
28153 +       mov pr=r31,-1           // I0
28154 +#ifdef CONFIG_XEN
28155 +       ;;
28156 +       XEN_HYPER_RFI;
28157 +#else
28158 +       rfi                     // B
28159 +#endif
28160 +
28161 +       /*
28162 +        * On entry:
28163 +        *      r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT)
28164 +        *      r31 = current->thread_info->flags
28165 +        * On exit:
28166 +        *      p6 = TRUE if work-pending-check needs to be redone
28167 +        */
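A C sketch of the decision the .work_pending block below makes; the function pointers stand in for the kernel's schedule() and notify_resume_user() calls, and the return value plays the role of p6 (1 = the work-pending check must be redone):

static int work_pending_sk(unsigned long flags, int need_resched_bit,
                           void (*do_schedule)(void),
                           void (*do_notify_resume_user)(void))
{
        if (flags & (1UL << need_resched_bit)) {
                do_schedule();              /* br.call.spnt.many rp=schedule     */
                return 1;                   /* .ret9: p6 <- 1, re-check for work */
        }
        do_notify_resume_user();            /* .notify: deliver pending signals  */
        return 0;                           /* .ret10: p6 <- 0, don't re-check   */
}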
28168 +.work_pending_syscall:
28169 +       add r2=-8,r2
28170 +       add r3=-8,r3
28171 +       ;;
28172 +       st8 [r2]=r8
28173 +       st8 [r3]=r10
28174 +.work_pending:
28175 +       tbit.z p6,p0=r31,TIF_NEED_RESCHED               // current_thread_info()->need_resched==0?
28176 +(p6)   br.cond.sptk.few .notify
28177 +#ifdef CONFIG_PREEMPT
28178 +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
28179 +       ;;
28180 +(pKStk) st4 [r20]=r21
28181 +       ssm psr.i               // enable interrupts
28182 +#endif
28183 +       br.call.spnt.many rp=schedule
28184 +.ret9: cmp.eq p6,p0=r0,r0                              // p6 <- 1
28185 +#ifdef CONFIG_XEN
28186 +       movl r2=XSI_PSR_I_ADDR
28187 +       mov r20=1
28188 +       ;;
28189 +       ld8 r2=[r2]
28190 +       ;;
28191 +       st1 [r2]=r20
28192 +#else
28193 +       rsm psr.i               // disable interrupts
28194 +#endif
28195 +       ;;
28196 +#ifdef CONFIG_PREEMPT
28197 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
28198 +       ;;
28199 +(pKStk)        st4 [r20]=r0            // preempt_count() <- 0
28200 +#endif
28201 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
28202 +       br.cond.sptk.many .work_processed_kernel        // re-check
28203 +
28204 +.notify:
28205 +(pUStk)        br.call.spnt.many rp=notify_resume_user
28206 +.ret10:        cmp.ne p6,p0=r0,r0                              // p6 <- 0
28207 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
28208 +       br.cond.sptk.many .work_processed_kernel        // don't re-check
28209 +
28210 +.work_pending_syscall_end:
28211 +       adds r2=PT(R8)+16,r12
28212 +       adds r3=PT(R10)+16,r12
28213 +       ;;
28214 +       ld8 r8=[r2]
28215 +       ld8 r10=[r3]
28216 +       br.cond.sptk.many .work_processed_syscall       // re-check
28217 +
28218 +#ifdef CONFIG_XEN
28219 +END(xen_leave_kernel)
28220 +#else
28221 +END(ia64_leave_kernel)
28222 +#endif
28223 diff -ruNp linux-2.6.19/arch/ia64/xen/xenhpski.c linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenhpski.c
28224 --- linux-2.6.19/arch/ia64/xen/xenhpski.c       1970-01-01 00:00:00.000000000 +0000
28225 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenhpski.c     2007-02-02 19:10:21.000000000 +0000
28226 @@ -0,0 +1,19 @@
28227 +
28228 +extern unsigned long xen_get_cpuid(int);
28229 +
28230 +int
28231 +running_on_sim(void)
28232 +{
28233 +       int i;
28234 +       long cpuid[6];
28235 +
28236 +       for (i = 0; i < 5; ++i)
28237 +               cpuid[i] = xen_get_cpuid(i);
28238 +       if ((cpuid[0] & 0xff) != 'H') return 0;         /* first byte of the CPUID vendor string */
28239 +       if ((cpuid[3] & 0xff) != 0x4) return 0;         /* CPUID[3]: number field */
28240 +       if (((cpuid[3] >> 8) & 0xff) != 0x0) return 0;  /* CPUID[3]: revision field */
28241 +       if (((cpuid[3] >> 16) & 0xff) != 0x0) return 0; /* CPUID[3]: model field */
28242 +       if (((cpuid[3] >> 24) & 0x7) != 0x7) return 0;  /* CPUID[3]: low bits of the family field */
28243 +       return 1;
28244 +}
28245 +
28246 diff -ruNp linux-2.6.19/arch/ia64/xen/xenivt.S linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenivt.S
28247 --- linux-2.6.19/arch/ia64/xen/xenivt.S 1970-01-01 00:00:00.000000000 +0000
28248 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenivt.S       2007-02-02 19:10:21.000000000 +0000
28249 @@ -0,0 +1,2178 @@
28250 +/*
28251 + * arch/ia64/xen/ivt.S
28252 + *
28253 + * Copyright (C) 2005 Hewlett-Packard Co
28254 + *     Dan Magenheimer <dan.magenheimer@hp.com>
28255 + */
28256 +/*
28257 + * This file defines the interruption vector table used by the CPU.
28258 + * It does not include one entry per possible cause of interruption.
28259 + *
28260 + * The first 20 entries of the table contain 64 bundles each while the
28261 + * remaining 48 entries contain only 16 bundles each.
28262 + *
28263 + * The 64 bundles are used to allow inlining the whole handler for critical
28264 + * interruptions like TLB misses.
28265 + *
28266 + *  For each entry, the comment is as follows:
28267 + *
28268 + *             // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
28269 + *  entry offset ----/     /         /                  /          /
28270 + *  entry number ---------/         /                  /          /
28271 + *  size of the entry -------------/                  /          /
28272 + *  vector name -------------------------------------/          /
28273 + *  interruptions triggering this vector ----------------------/
28274 + *
28275 + * The table is 32KB in size and must be aligned on 32KB boundary.
28276 + * (The CPU ignores the 15 lower bits of the address)
28277 + *
28278 + * Table is based upon EAS2.6 (Oct 1999)
28279 + */
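A quick cross-check of the geometry described above: 20 entries x 64 bundles + 48 entries x 16 bundles = 2048 bundles, and at 16 bytes per bundle that is 32768 bytes, i.e. exactly the 32KB the table occupies and must be aligned to.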
28280 +
28281 +#include <asm/asmmacro.h>
28282 +#include <asm/break.h>
28283 +#include <asm/ia32.h>
28284 +#include <asm/kregs.h>
28285 +#include <asm/asm-offsets.h>
28286 +#include <asm/pgtable.h>
28287 +#include <asm/processor.h>
28288 +#include <asm/ptrace.h>
28289 +#include <asm/system.h>
28290 +#include <asm/thread_info.h>
28291 +#include <asm/unistd.h>
28292 +#include <asm/errno.h>
28293 +
28294 +#ifdef CONFIG_XEN
28295 +#define ia64_ivt xen_ivt
28296 +#endif
28297 +
28298 +#if 1
28299 +# define PSR_DEFAULT_BITS      psr.ac
28300 +#else
28301 +# define PSR_DEFAULT_BITS      0
28302 +#endif
28303 +
28304 +#if 0
28305 +  /*
28306 +   * This lets you track the last eight faults that occurred on the CPU.  Make sure ar.k2 isn't
28307 +   * needed for something else before enabling this...
28308 +   */
28309 +# define DBG_FAULT(i)  mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
28310 +#else
28311 +# define DBG_FAULT(i)
28312 +#endif
28313 +
28314 +#define MINSTATE_VIRT  /* needed by minstate.h */
28315 +#include "xenminstate.h"
28316 +
28317 +#define FAULT(n)                                                                       \
28318 +       mov r31=pr;                                                                     \
28319 +       mov r19=n;;                     /* prepare to save predicates */                \
28320 +       br.sptk.many dispatch_to_fault_handler
28321 +
28322 +       .section .text.ivt,"ax"
28323 +
28324 +       .align 32768    // align on 32KB boundary
28325 +       .global ia64_ivt
28326 +ia64_ivt:
28327 +/////////////////////////////////////////////////////////////////////////////////////////
28328 +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
28329 +ENTRY(vhpt_miss)
28330 +       DBG_FAULT(0)
28331 +       /*
28332 +        * The VHPT vector is invoked when the TLB entry for the virtual page table
28333 +        * is missing.  This happens only as a result of a previous
28334 +        * (the "original") TLB miss, which may either be caused by an instruction
28335 +        * fetch or a data access (or non-access).
28336 +        *
28337 +        * What we do here is normal TLB miss handling for the _original_ miss,
28338 +        * followed by inserting the TLB entry for the virtual page table page
28339 +        * that the VHPT walker was attempting to access.  The latter gets
28340 +        * inserted as long as the page table entries above the pte level have valid
28341 +        * mappings for the faulting address.  The TLB entry for the original
28342 +        * miss gets inserted only if the pte entry indicates that the page is
28343 +        * present.
28344 +        *
28345 +        * do_page_fault gets invoked in the following cases:
28346 +        *      - the faulting virtual address uses unimplemented address bits
28347 +        *      - the faulting virtual address has no valid page table mapping
28348 +        */
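A plain-C sketch of the page-table walk the handler below performs; the shift constants and the three-level layout are illustrative only (the real code derives them from PGDIR_SHIFT/PMD_SHIFT/PAGE_SHIFT and inserts a pud level under CONFIG_PGTABLE_4):

#include <stdint.h>
#include <stddef.h>

#define SK_PGDIR_SHIFT 33                    /* illustrative values, not the kernel's */
#define SK_PMD_SHIFT   23
#define SK_PAGE_SHIFT  13
#define SK_IDX(va, sh) (((va) >> (sh)) & 0x3ff)

static uint64_t *walk_pte(uint64_t *pgd_base, uint64_t va)
{
        uint64_t *pgd = pgd_base + SK_IDX(va, SK_PGDIR_SHIFT);
        if (!*pgd)
                return NULL;                                  /* -> page_fault */
        uint64_t *pmd = (uint64_t *)*pgd + SK_IDX(va, SK_PMD_SHIFT);
        if (!*pmd)
                return NULL;                                  /* -> page_fault */
        return (uint64_t *)*pmd + SK_IDX(va, SK_PAGE_SHIFT);  /* pte slot; the present bit is checked afterwards */
}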
28349 +#ifdef CONFIG_XEN
28350 +       movl r16=XSI_IFA
28351 +       ;;
28352 +       ld8 r16=[r16]
28353 +#ifdef CONFIG_HUGETLB_PAGE
28354 +       movl r18=PAGE_SHIFT
28355 +       movl r25=XSI_ITIR
28356 +       ;;
28357 +       ld8 r25=[r25]
28358 +#endif
28359 +       ;;
28360 +#else
28361 +       mov r16=cr.ifa                          // get address that caused the TLB miss
28362 +#ifdef CONFIG_HUGETLB_PAGE
28363 +       movl r18=PAGE_SHIFT
28364 +       mov r25=cr.itir
28365 +#endif
28366 +#endif
28367 +       ;;
28368 +#ifdef CONFIG_XEN
28369 +       XEN_HYPER_RSM_PSR_DT;
28370 +#else
28371 +       rsm psr.dt                              // use physical addressing for data
28372 +#endif
28373 +       mov r31=pr                              // save the predicate registers
28374 +       mov r19=IA64_KR(PT_BASE)                // get page table base address
28375 +       shl r21=r16,3                           // shift bit 60 into sign bit
28376 +       shr.u r17=r16,61                        // get the region number into r17
28377 +       ;;
28378 +       shr.u r22=r21,3
28379 +#ifdef CONFIG_HUGETLB_PAGE
28380 +       extr.u r26=r25,2,6
28381 +       ;;
28382 +       cmp.ne p8,p0=r18,r26
28383 +       sub r27=r26,r18
28384 +       ;;
28385 +(p8)   dep r25=r18,r25,2,6
28386 +(p8)   shr r22=r22,r27
28387 +#endif
28388 +       ;;
28389 +       cmp.eq p6,p7=5,r17                      // is IFA pointing into region 5?
28390 +       shr.u r18=r22,PGDIR_SHIFT               // get bottom portion of pgd index bit
28391 +       ;;
28392 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
28393 +
28394 +       srlz.d
28395 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
28396 +
28397 +       .pred.rel "mutex", p6, p7
28398 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
28399 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
28400 +       ;;
28401 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=pgd_offset for region 5
28402 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=pgd_offset for region[0-4]
28403 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
28404 +#ifdef CONFIG_PGTABLE_4
28405 +       shr.u r28=r22,PUD_SHIFT                 // shift pud index into position
28406 +#else
28407 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
28408 +#endif
28409 +       ;;
28410 +       ld8 r17=[r17]                           // get *pgd (may be 0)
28411 +       ;;
28412 +(p7)   cmp.eq p6,p7=r17,r0                     // was pgd_present(*pgd) == NULL?
28413 +#ifdef CONFIG_PGTABLE_4
28414 +       dep r28=r28,r17,3,(PAGE_SHIFT-3)        // r28=pud_offset(pgd,addr)
28415 +       ;;
28416 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
28417 +(p7)   ld8 r29=[r28]                           // get *pud (may be 0)
28418 +       ;;
28419 +(p7)   cmp.eq.or.andcm p6,p7=r29,r0            // was pud_present(*pud) == NULL?
28420 +       dep r17=r18,r29,3,(PAGE_SHIFT-3)        // r17=pmd_offset(pud,addr)
28421 +#else
28422 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // r17=pmd_offset(pgd,addr)
28423 +#endif
28424 +       ;;
28425 +(p7)   ld8 r20=[r17]                           // get *pmd (may be 0)
28426 +       shr.u r19=r22,PAGE_SHIFT                // shift pte index into position
28427 +       ;;
28428 +(p7)   cmp.eq.or.andcm p6,p7=r20,r0            // was pmd_present(*pmd) == NULL?
28429 +       dep r21=r19,r20,3,(PAGE_SHIFT-3)        // r21=pte_offset(pmd,addr)
28430 +       ;;
28431 +(p7)   ld8 r18=[r21]                           // read *pte
28432 +#ifdef CONFIG_XEN
28433 +       movl r19=XSI_ISR
28434 +       ;;
28435 +       ld8 r19=[r19]
28436 +#else
28437 +       mov r19=cr.isr                          // cr.isr bit 32 tells us if this is an insn miss
28438 +#endif
28439 +       ;;
28440 +(p7)   tbit.z p6,p7=r18,_PAGE_P_BIT            // page present bit cleared?
28441 +#ifdef CONFIG_XEN
28442 +       movl r22=XSI_IHA
28443 +       ;;
28444 +       ld8 r22=[r22]
28445 +#else
28446 +       mov r22=cr.iha                          // get the VHPT address that caused the TLB miss
28447 +#endif
28448 +       ;;                                      // avoid RAW on p7
28449 +(p7)   tbit.nz.unc p10,p11=r19,32              // is it an instruction TLB miss?
28450 +       dep r23=0,r20,0,PAGE_SHIFT              // clear low bits to get page address
28451 +       ;;
28452 +#ifdef CONFIG_XEN
28453 +       mov r24=r8
28454 +       mov r8=r18
28455 +       ;;
28456 +(p10)  XEN_HYPER_ITC_I
28457 +       ;;
28458 +(p11)  XEN_HYPER_ITC_D
28459 +       ;;
28460 +       mov r8=r24
28461 +       ;;
28462 +#else
28463 +(p10)  itc.i r18                               // insert the instruction TLB entry
28464 +(p11)  itc.d r18                               // insert the data TLB entry
28465 +#endif
28466 +(p6)   br.cond.spnt.many page_fault            // handle bad address/page not present (page fault)
28467 +#ifdef CONFIG_XEN
28468 +       movl r24=XSI_IFA
28469 +       ;;
28470 +       st8 [r24]=r22
28471 +       ;;
28472 +#else
28473 +       mov cr.ifa=r22
28474 +#endif
28475 +
28476 +#ifdef CONFIG_HUGETLB_PAGE
28477 +(p8)   mov cr.itir=r25                         // change to default page-size for VHPT
28478 +#endif
28479 +
28480 +       /*
28481 +        * Now compute and insert the TLB entry for the virtual page table.  We never
28482 +        * execute in a page table page so there is no need to set the exception deferral
28483 +        * bit.
28484 +        */
28485 +       adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
28486 +       ;;
28487 +#ifdef CONFIG_XEN
28488 +(p7)   mov r25=r8
28489 +(p7)   mov r8=r24
28490 +       ;;
28491 +(p7)   XEN_HYPER_ITC_D
28492 +       ;;
28493 +(p7)   mov r8=r25
28494 +       ;;
28495 +#else
28496 +(p7)   itc.d r24
28497 +#endif
28498 +       ;;
28499 +#ifdef CONFIG_SMP
28500 +       /*
28501 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
28502 +        * cannot possibly affect the following loads:
28503 +        */
28504 +       dv_serialize_data
28505 +
28506 +       /*
28507 +        * Re-check the pagetable entries.  If they changed, we may have received a ptc.g
28508 +        * between reading the pagetable and the "itc".  If so, flush the entry we
28509 +        * inserted and retry.  At this point, we have:
28510 +        *
28511 +        * r28 = equivalent of pud_offset(pgd, ifa)
28512 +        * r17 = equivalent of pmd_offset(pud, ifa)
28513 +        * r21 = equivalent of pte_offset(pmd, ifa)
28514 +        *
28515 +        * r29 = *pud
28516 +        * r20 = *pmd
28517 +        * r18 = *pte
28518 +        */
28519 +       ld8 r25=[r21]                           // read *pte again
28520 +       ld8 r26=[r17]                           // read *pmd again
28521 +#ifdef CONFIG_PGTABLE_4
28522 +       ld8 r19=[r28]                           // read *pud again
28523 +#endif
28524 +       cmp.ne p6,p7=r0,r0
28525 +       ;;
28526 +       cmp.ne.or.andcm p6,p7=r26,r20           // did *pmd change
28527 +#ifdef CONFIG_PGTABLE_4
28528 +       cmp.ne.or.andcm p6,p7=r19,r29           // did *pud change
28529 +#endif
28530 +       mov r27=PAGE_SHIFT<<2
28531 +       ;;
28532 +(p6)   ptc.l r22,r27                           // purge PTE page translation
28533 +(p7)   cmp.ne.or.andcm p6,p7=r25,r18           // did *pte change
28534 +       ;;
28535 +(p6)   ptc.l r16,r27                           // purge translation
28536 +#endif
28537 +
28538 +       mov pr=r31,-1                           // restore predicate registers
28539 +#ifdef CONFIG_XEN
28540 +       XEN_HYPER_RFI
28541 +       dv_serialize_data
28542 +#else
28543 +       rfi
28544 +#endif
28545 +END(vhpt_miss)
28546 +
28547 +       .org ia64_ivt+0x400
28548 +/////////////////////////////////////////////////////////////////////////////////////////
28549 +// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
28550 +ENTRY(itlb_miss)
28551 +       DBG_FAULT(1)
28552 +       /*
28553 +        * The ITLB handler accesses the PTE via the virtually mapped linear
28554 +        * page table.  If a nested TLB miss occurs, we switch into physical
28555 +        * mode, walk the page table, and then re-execute the PTE read and
28556 +        * go on normally after that.
28557 +        */
28558 +#ifdef CONFIG_XEN
28559 +       movl r16=XSI_IFA
28560 +       ;;
28561 +       ld8 r16=[r16]
28562 +#else
28563 +       mov r16=cr.ifa                          // get virtual address
28564 +#endif
28565 +       mov r29=b0                              // save b0
28566 +       mov r31=pr                              // save predicates
28567 +.itlb_fault:
28568 +#ifdef CONFIG_XEN
28569 +       movl r17=XSI_IHA
28570 +       ;;
28571 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
28572 +#else
28573 +       mov r17=cr.iha                          // get virtual address of PTE
28574 +#endif
28575 +       movl r30=1f                             // load nested fault continuation point
28576 +       ;;
28577 +1:     ld8 r18=[r17]                           // read *pte
28578 +       ;;
28579 +       mov b0=r29
28580 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
28581 +(p6)   br.cond.spnt page_fault
28582 +       ;;
28583 +#ifdef CONFIG_XEN
28584 +       mov r19=r8
28585 +       mov r8=r18
28586 +       ;;
28587 +       XEN_HYPER_ITC_I
28588 +       ;;
28589 +       mov r8=r19
28590 +#else
28591 +       itc.i r18
28592 +#endif
28593 +       ;;
28594 +#ifdef CONFIG_SMP
28595 +       /*
28596 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
28597 +        * cannot possibly affect the following loads:
28598 +        */
28599 +       dv_serialize_data
28600 +
28601 +       ld8 r19=[r17]                           // read *pte again and see if same
28602 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
28603 +       ;;
28604 +       cmp.ne p7,p0=r18,r19
28605 +       ;;
28606 +(p7)   ptc.l r16,r20
28607 +#endif
28608 +       mov pr=r31,-1
28609 +#ifdef CONFIG_XEN
28610 +       XEN_HYPER_RFI
28611 +       dv_serialize_data
28612 +#else
28613 +       rfi
28614 +#endif
28615 +END(itlb_miss)
28616 +
28617 +       .org ia64_ivt+0x0800
28618 +/////////////////////////////////////////////////////////////////////////////////////////
28619 +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
28620 +ENTRY(dtlb_miss)
28621 +       DBG_FAULT(2)
28622 +       /*
28623 +        * The DTLB handler accesses the PTE via the virtually mapped linear
28624 +        * page table.  If a nested TLB miss occurs, we switch into physical
28625 +        * mode, walk the page table, and then re-execute the PTE read and
28626 +        * go on normally after that.
28627 +        */
28628 +#ifdef CONFIG_XEN
28629 +       movl r16=XSI_IFA
28630 +       ;;
28631 +       ld8 r16=[r16]
28632 +#else
28633 +       mov r16=cr.ifa                          // get virtual address
28634 +#endif
28635 +       mov r29=b0                              // save b0
28636 +       mov r31=pr                              // save predicates
28637 +dtlb_fault:
28638 +#ifdef CONFIG_XEN
28639 +       movl r17=XSI_IHA
28640 +       ;;
28641 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
28642 +#else
28643 +       mov r17=cr.iha                          // get virtual address of PTE
28644 +#endif
28645 +       movl r30=1f                             // load nested fault continuation point
28646 +       ;;
28647 +1:     ld8 r18=[r17]                           // read *pte
28648 +       ;;
28649 +       mov b0=r29
28650 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
28651 +(p6)   br.cond.spnt page_fault
28652 +       ;;
28653 +#ifdef CONFIG_XEN
28654 +       mov r19=r8
28655 +       mov r8=r18
28656 +       ;;
28657 +       XEN_HYPER_ITC_D
28658 +       ;;
28659 +       mov r8=r19
28660 +       ;;
28661 +#else
28662 +       itc.d r18
28663 +#endif
28664 +       ;;
28665 +#ifdef CONFIG_SMP
28666 +       /*
28667 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
28668 +        * cannot possibly affect the following loads:
28669 +        */
28670 +       dv_serialize_data
28671 +
28672 +       ld8 r19=[r17]                           // read *pte again and see if same
28673 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
28674 +       ;;
28675 +       cmp.ne p7,p0=r18,r19
28676 +       ;;
28677 +(p7)   ptc.l r16,r20
28678 +#endif
28679 +       mov pr=r31,-1
28680 +#ifdef CONFIG_XEN
28681 +       XEN_HYPER_RFI
28682 +       dv_serialize_data
28683 +#else
28684 +       rfi
28685 +#endif
28686 +END(dtlb_miss)
28687 +
28688 +       .org ia64_ivt+0x0c00
28689 +/////////////////////////////////////////////////////////////////////////////////////////
28690 +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
28691 +ENTRY(alt_itlb_miss)
28692 +       DBG_FAULT(3)
28693 +#ifdef CONFIG_XEN
28694 +       movl r31=XSI_IPSR
28695 +       ;;
28696 +       ld8 r21=[r31],XSI_IFA_OFS-XSI_IPSR_OFS  // get ipsr, point to ifa
28697 +       movl r17=PAGE_KERNEL
28698 +       ;;
28699 +       ld8 r16=[r31]           // get ifa
28700 +#else
28701 +       mov r16=cr.ifa          // get address that caused the TLB miss
28702 +       movl r17=PAGE_KERNEL
28703 +       mov r21=cr.ipsr
28704 +#endif
28705 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28706 +       mov r31=pr
28707 +       ;;
28708 +#ifdef CONFIG_DISABLE_VHPT
28709 +       shr.u r22=r16,61                        // get the region number into r22
28710 +       ;;
28711 +       cmp.gt p8,p0=6,r22                      // user mode
28712 +       ;;
28713 +#ifndef CONFIG_XEN
28714 +(p8)   thash r17=r16
28715 +       ;;
28716 +(p8)   mov cr.iha=r17
28717 +#endif
28718 +(p8)   mov r29=b0                              // save b0
28719 +(p8)   br.cond.dptk .itlb_fault
28720 +#endif
28721 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
28722 +       and r19=r19,r16         // clear ed, reserved bits, and PTE control bits
28723 +       shr.u r18=r16,57        // move address bit 61 to bit 4
28724 +       ;;
28725 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
28726 +       cmp.ne p8,p0=r0,r23     // psr.cpl != 0?
28727 +       or r19=r17,r19          // insert PTE control bits into r19
28728 +       ;;
28729 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
28730 +(p8)   br.cond.spnt page_fault
28731 +       ;;
28732 +#ifdef CONFIG_XEN
28733 +       mov r18=r8
28734 +       mov r8=r19
28735 +       ;;
28736 +       XEN_HYPER_ITC_I
28737 +       ;;
28738 +       mov r8=r18
28739 +       ;;
28740 +       mov pr=r31,-1
28741 +       ;;
28742 +       XEN_HYPER_RFI;
28743 +#else
28744 +       itc.i r19               // insert the TLB entry
28745 +       mov pr=r31,-1
28746 +       rfi
28747 +#endif
28748 +END(alt_itlb_miss)
28749 +
28750 +       .org ia64_ivt+0x1000
28751 +/////////////////////////////////////////////////////////////////////////////////////////
28752 +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
28753 +ENTRY(alt_dtlb_miss)
28754 +       DBG_FAULT(4)
28755 +#ifdef CONFIG_XEN
28756 +       movl r31=XSI_IPSR
28757 +       ;;
28758 +       ld8 r21=[r31],XSI_ISR_OFS-XSI_IPSR_OFS  // get ipsr, point to isr
28759 +       movl r17=PAGE_KERNEL
28760 +       ;;
28761 +       ld8 r20=[r31],XSI_IFA_OFS-XSI_ISR_OFS   // get isr, point to ifa
28762 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28763 +       ;;
28764 +       ld8 r16=[r31]           // get ifa
28765 +#else
28766 +       mov r16=cr.ifa          // get address that caused the TLB miss
28767 +       movl r17=PAGE_KERNEL
28768 +       mov r20=cr.isr
28769 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28770 +       mov r21=cr.ipsr
28771 +#endif
28772 +       mov r31=pr
28773 +       ;;
28774 +#ifdef CONFIG_DISABLE_VHPT
28775 +       shr.u r22=r16,61                        // get the region number into r22
28776 +       ;;
28777 +       cmp.gt p8,p0=6,r22                      // access to region 0-5
28778 +       ;;
28779 +#ifndef CONFIG_XEN
28780 +(p8)   thash r17=r16
28781 +       ;;
28782 +(p8)   mov cr.iha=r17
28783 +#endif
28784 +(p8)   mov r29=b0                              // save b0
28785 +(p8)   br.cond.dptk dtlb_fault
28786 +#endif
28787 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
28788 +       and r22=IA64_ISR_CODE_MASK,r20          // get the isr.code field
28789 +       tbit.nz p6,p7=r20,IA64_ISR_SP_BIT       // is speculation bit on?
28790 +       shr.u r18=r16,57                        // move address bit 61 to bit 4
28791 +       and r19=r19,r16                         // clear ed, reserved bits, and PTE control bits
28792 +       tbit.nz p9,p0=r20,IA64_ISR_NA_BIT       // is non-access bit on?
28793 +       ;;
28794 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
28795 +       cmp.ne p8,p0=r0,r23
28796 +(p9)   cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22  // check isr.code field
28797 +(p8)   br.cond.spnt page_fault
28798 +
28799 +       dep r21=-1,r21,IA64_PSR_ED_BIT,1
28800 +       or r19=r19,r17          // insert PTE control bits into r19
28801 +       ;;
28802 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
28803 +(p6)   mov cr.ipsr=r21
28804 +       ;;
28805 +#ifdef CONFIG_XEN
28806 +(p7)   mov r18=r8
28807 +(p7)   mov r8=r19
28808 +       ;;
28809 +(p7)   XEN_HYPER_ITC_D
28810 +       ;;
28811 +(p7)   mov r8=r18
28812 +       ;;
28813 +       mov pr=r31,-1
28814 +       ;;
28815 +       XEN_HYPER_RFI;
28816 +#else
28817 +(p7)   itc.d r19               // insert the TLB entry
28818 +       mov pr=r31,-1
28819 +       rfi
28820 +#endif
28821 +END(alt_dtlb_miss)
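A hedged C sketch of the identity-mapped PTE the two alt-TLB handlers above synthesize; the bit position for the uncached attribute and the mask arguments are illustrative stand-ins for what the assembly builds from IA64_MAX_PHYS_BITS and PAGE_KERNEL:

#include <stdint.h>

static uint64_t alt_tlb_pte(uint64_t ifa, uint64_t page_kernel_bits, uint64_t phys_mask)
{
        uint64_t pte = (ifa & phys_mask) | page_kernel_bits;  /* identity-map the faulting address      */
        unsigned int region = (unsigned int)(ifa >> 61);      /* kernel identity regions 6/7 are the    */
        if (region == 6)                                      /* interesting case here                  */
                pte |= 1ULL << 4;                             /* region 6: uncacheable attribute        */
        return pte;
}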
28822 +
28823 +       .org ia64_ivt+0x1400
28824 +/////////////////////////////////////////////////////////////////////////////////////////
28825 +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
28826 +ENTRY(nested_dtlb_miss)
28827 +       /*
28828 +        * In the absence of kernel bugs, we get here when the virtually mapped linear
28829 +        * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
28830 +        * Access-bit, or Data Access-bit faults).  If the DTLB entry for the virtual page
28831 +        * table is missing, a nested TLB miss fault is triggered and control is
28832 +        * transferred to this point.  When this happens, we look up the pte for the
28833 +        * faulting address by walking the page table in physical mode and return to the
28834 +        * continuation point passed in register r30 (or call page_fault if the address is
28835 +        * not mapped).
28836 +        *
28837 +        * Input:       r16:    faulting address
28838 +        *              r29:    saved b0
28839 +        *              r30:    continuation address
28840 +        *              r31:    saved pr
28841 +        *
28842 +        * Output:      r17:    physical address of PTE of faulting address
28843 +        *              r29:    saved b0
28844 +        *              r30:    continuation address
28845 +        *              r31:    saved pr
28846 +        *
28847 +        * Clobbered:   b0, r18, r19, r21, r22, psr.dt (cleared)
28848 +        */
28849 +#ifdef CONFIG_XEN
28850 +       XEN_HYPER_RSM_PSR_DT;
28851 +#else
28852 +       rsm psr.dt                              // switch to using physical data addressing
28853 +#endif
28854 +       mov r19=IA64_KR(PT_BASE)                // get the page table base address
28855 +       shl r21=r16,3                           // shift bit 60 into sign bit
28856 +#ifdef CONFIG_XEN
28857 +       movl r18=XSI_ITIR
28858 +       ;;
28859 +       ld8 r18=[r18]
28860 +#else
28861 +       mov r18=cr.itir
28862 +#endif
28863 +       ;;
28864 +       shr.u r17=r16,61                        // get the region number into r17
28865 +       extr.u r18=r18,2,6                      // get the faulting page size
28866 +       ;;
28867 +       cmp.eq p6,p7=5,r17                      // is faulting address in region 5?
28868 +       add r22=-PAGE_SHIFT,r18                 // adjustment for hugetlb address
28869 +       add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
28870 +       ;;
28871 +       shr.u r22=r16,r22
28872 +       shr.u r18=r16,r18
28873 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
28874 +
28875 +       srlz.d
28876 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
28877 +
28878 +       .pred.rel "mutex", p6, p7
28879 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
28880 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
28881 +       ;;
28882 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=pgd_offset for region 5
28883 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=pgd_offset for region[0-4]
28884 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
28885 +#ifdef CONFIG_PGTABLE_4
28886 +       shr.u r18=r22,PUD_SHIFT                 // shift pud index into position
28887 +#else
28888 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
28889 +#endif
28890 +       ;;
28891 +       ld8 r17=[r17]                           // get *pgd (may be 0)
28892 +       ;;
28893 +(p7)   cmp.eq p6,p7=r17,r0                     // was pgd_present(*pgd) == NULL?
28894 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // r17=p[u|m]d_offset(pgd,addr)
28895 +       ;;
28896 +#ifdef CONFIG_PGTABLE_4
28897 +(p7)   ld8 r17=[r17]                           // get *pud (may be 0)
28898 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
28899 +       ;;
28900 +(p7)   cmp.eq.or.andcm p6,p7=r17,r0            // was pud_present(*pud) == NULL?
28901 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // r17=pmd_offset(pud,addr)
28902 +       ;;
28903 +#endif
28904 +(p7)   ld8 r17=[r17]                           // get *pmd (may be 0)
28905 +       shr.u r19=r22,PAGE_SHIFT                // shift pte index into position
28906 +       ;;
28907 +(p7)   cmp.eq.or.andcm p6,p7=r17,r0            // was pmd_present(*pmd) == NULL?
28908 +       dep r17=r19,r17,3,(PAGE_SHIFT-3)        // r17=pte_offset(pmd,addr);
28909 +(p6)   br.cond.spnt page_fault
28910 +       mov b0=r30
28911 +       br.sptk.many b0                         // return to continuation point
28912 +END(nested_dtlb_miss)
28913 +
28914 +       .org ia64_ivt+0x1800
28915 +/////////////////////////////////////////////////////////////////////////////////////////
28916 +// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
28917 +ENTRY(ikey_miss)
28918 +       DBG_FAULT(6)
28919 +       FAULT(6)
28920 +END(ikey_miss)
28921 +
28922 +       //-----------------------------------------------------------------------------------
28923 +       // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
28924 +ENTRY(page_fault)
28925 +#ifdef CONFIG_XEN
28926 +       XEN_HYPER_SSM_PSR_DT
28927 +#else
28928 +       ssm psr.dt
28929 +       ;;
28930 +       srlz.i
28931 +#endif
28932 +       ;;
28933 +       SAVE_MIN_WITH_COVER
28934 +       alloc r15=ar.pfs,0,0,3,0
28935 +#ifdef CONFIG_XEN
28936 +       movl r3=XSI_ISR
28937 +       ;;
28938 +       ld8 out1=[r3],XSI_IFA_OFS-XSI_ISR_OFS   // get vcr.isr, point to ifa
28939 +       ;;
28940 +       ld8 out0=[r3]                           // get vcr.ifa
28941 +       mov r14=1
28942 +       ;;
28943 +       add r3=XSI_PSR_IC_OFS-XSI_IFA_OFS, r3   // point to vpsr.ic
28944 +       ;;
28945 +       st4 [r3]=r14                            // vpsr.ic = 1
28946 +       adds r3=8,r2                            // set up second base pointer
28947 +       ;;
28948 +#else
28949 +       mov out0=cr.ifa
28950 +       mov out1=cr.isr
28951 +       adds r3=8,r2                            // set up second base pointer
28952 +       ;;
28953 +       ssm psr.ic | PSR_DEFAULT_BITS
28954 +       ;;
28955 +       srlz.i                                  // guarantee that interruption collection is on
28956 +       ;;
28957 +#endif
28958 +#ifdef CONFIG_XEN
28959 +       br.cond.sptk.many       xen_page_fault
28960 +       ;;
28961 +done_xen_page_fault:
28962 +#endif
28963 +(p15)  ssm psr.i                               // restore psr.i
28964 +       movl r14=ia64_leave_kernel
28965 +       ;;
28966 +       SAVE_REST
28967 +       mov rp=r14
28968 +       ;;
28969 +       adds out2=16,r12                        // out2 = pointer to pt_regs
28970 +       br.call.sptk.many b6=ia64_do_page_fault // ignore return address
28971 +END(page_fault)
28972 +
28973 +       .org ia64_ivt+0x1c00
28974 +/////////////////////////////////////////////////////////////////////////////////////////
28975 +// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
28976 +ENTRY(dkey_miss)
28977 +       DBG_FAULT(7)
28978 +       FAULT(7)
28979 +#ifdef CONFIG_XEN
28980 +       // Leaving this code inline above results in an IVT section overflow
28981 +       // There is no particular reason for this code to be here...
28982 +xen_page_fault:
28983 +(p15)  movl r3=XSI_PSR_I_ADDR
28984 +       ;;
28985 +(p15)  ld8 r3=[r3]
28986 +       ;;
28987 +(p15)  st1 [r3]=r0,-1  // if (p15) vpsr.i = 1
28988 +       mov r14=r0
28989 +       ;;
28990 +(p15)  ld1 r14=[r3]                            // if (pending_events)
28991 +       adds r3=8,r2                            // re-set up second base pointer
28992 +       ;;
28993 +(p15)  cmp.ne  p15,p0=r14,r0
28994 +       ;;
28995 +       br.cond.sptk.many done_xen_page_fault
28996 +       ;;
28997 +#endif
28998 +END(dkey_miss)
28999 +
29000 +       .org ia64_ivt+0x2000
29001 +/////////////////////////////////////////////////////////////////////////////////////////
29002 +// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
29003 +ENTRY(dirty_bit)
29004 +       DBG_FAULT(8)
29005 +       /*
29006 +        * What we do here is to simply turn on the dirty bit in the PTE.  We need to
29007 +        * update both the page-table and the TLB entry.  To efficiently access the PTE,
29008 +        * we address it through the virtual page table.  Most likely, the TLB entry for
29009 +        * the relevant virtual page table page is still present in the TLB so we can
29010 +        * normally do this without additional TLB misses.  In case the necessary virtual
29011 +        * page table TLB entry isn't present, we take a nested TLB miss hit where we look
29012 +        * up the physical address of the L3 PTE and then continue at label 1 below.
29013 +        */
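A sketch of the SMP-safe PTE update described above, using a C11 compare-exchange in place of cmpxchg8.acq; the bit masks are illustrative, not the real ia64 PTE layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SK_PAGE_P (1ULL << 0)                   /* illustrative bit positions */
#define SK_PAGE_A (1ULL << 5)
#define SK_PAGE_D (1ULL << 6)

static bool set_dirty_and_accessed(_Atomic uint64_t *pte)
{
        uint64_t old = atomic_load(pte);
        if (!(old & SK_PAGE_P))
                return false;                   /* page not present: leave it alone */
        uint64_t newval = old | SK_PAGE_D | SK_PAGE_A;
        /* If this fails, a concurrent update (e.g. a ptc.g purge) changed the
         * PTE; the assembly re-reads the PTE afterwards and purges the local
         * translation if it no longer matches. */
        return atomic_compare_exchange_strong(pte, &old, newval);
}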
29014 +#ifdef CONFIG_XEN
29015 +       movl r16=XSI_IFA
29016 +       ;;
29017 +       ld8 r16=[r16]
29018 +       ;;
29019 +#else
29020 +       mov r16=cr.ifa                          // get the address that caused the fault
29021 +#endif
29022 +       movl r30=1f                             // load continuation point in case of nested fault
29023 +       ;;
29024 +#ifdef CONFIG_XEN
29025 +       mov r18=r8;
29026 +       mov r8=r16;
29027 +       XEN_HYPER_THASH;;
29028 +       mov r17=r8;
29029 +       mov r8=r18;;
29030 +#else
29031 +       thash r17=r16                           // compute virtual address of L3 PTE
29032 +#endif
29033 +       mov r29=b0                              // save b0 in case of nested fault
29034 +       mov r31=pr                              // save pr
29035 +#ifdef CONFIG_SMP
29036 +       mov r28=ar.ccv                          // save ar.ccv
29037 +       ;;
29038 +1:     ld8 r18=[r17]
29039 +       ;;                                      // avoid RAW on r18
29040 +       mov ar.ccv=r18                          // set compare value for cmpxchg
29041 +       or r25=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
29042 +       tbit.z p7,p6 = r18,_PAGE_P_BIT          // Check present bit
29043 +       ;;
29044 +(p6)   cmpxchg8.acq r26=[r17],r25,ar.ccv       // Only update if page is present
29045 +       mov r24=PAGE_SHIFT<<2
29046 +       ;;
29047 +(p6)   cmp.eq p6,p7=r26,r18                    // Only compare if page is present
29048 +       ;;
29049 +#ifdef CONFIG_XEN
29050 +(p6)   mov r18=r8
29051 +(p6)   mov r8=r25
29052 +       ;;
29053 +(p6)   XEN_HYPER_ITC_D
29054 +       ;;
29055 +(p6)   mov r8=r18
29056 +#else
29057 +(p6)   itc.d r25                               // install updated PTE
29058 +#endif 
29059 +       ;;
29060 +       /*
29061 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
29062 +        * cannot possibly affect the following loads:
29063 +        */
29064 +       dv_serialize_data
29065 +
29066 +       ld8 r18=[r17]                           // read PTE again
29067 +       ;;
29068 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
29069 +       ;;
29070 +(p7)   ptc.l r16,r24
29071 +       mov b0=r29                              // restore b0
29072 +       mov ar.ccv=r28
29073 +#else
29074 +       ;;
29075 +1:     ld8 r18=[r17]
29076 +       ;;                                      // avoid RAW on r18
29077 +       or r18=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
29078 +       mov b0=r29                              // restore b0
29079 +       ;;
29080 +       st8 [r17]=r18                           // store back updated PTE
29081 +       itc.d r18                               // install updated PTE
29082 +#endif
29083 +       mov pr=r31,-1                           // restore pr
29084 +#ifdef CONFIG_XEN
29085 +       XEN_HYPER_RFI
29086 +       dv_serialize_data
29087 +#else
29088 +       rfi
29089 +#endif
29090 +END(dirty_bit)
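
The SMP path above condenses to a short C routine: read the PTE through the virtually mapped page table, set the dirty/accessed bits with a compare-and-exchange so a racing update is not lost, reinstall the translation, and purge it again if the PTE changed underneath us. Under Xen only the privileged operations differ: thash, itc.d and rfi become hyperprivops that pass their operand through r8. A rough sketch only; xen_thash(), xen_itc_d() and ptc_l() are illustrative stand-ins, not real kernel functions:

        /* Rough C analogue of the dirty_bit SMP fast path above.  xen_thash(),
         * xen_itc_d() and ptc_l() stand in for XEN_HYPER_THASH, XEN_HYPER_ITC_D
         * and the ptc.l instruction (thash/itc.d on bare metal).
         */
        static void dirty_bit_sketch(unsigned long ifa)
        {
                unsigned long *pte = xen_thash(ifa);    /* virtual address of the L3 PTE */
                unsigned long old = *pte;
                unsigned long new = old | _PAGE_D | _PAGE_A;

                if (old & _PAGE_P) {                    /* only update present pages */
                        if (cmpxchg(pte, old, new) == old)
                                xen_itc_d(new);         /* install the updated translation */
                }
                if (*pte != new)                        /* PTE changed under us: purge our insert */
                        ptc_l(ifa, PAGE_SHIFT << 2);
                /* rfi / XEN_HYPER_RFI then restarts the faulting instruction */
        }
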
29091 +
29092 +       .org ia64_ivt+0x2400
29093 +/////////////////////////////////////////////////////////////////////////////////////////
29094 +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
29095 +ENTRY(iaccess_bit)
29096 +       DBG_FAULT(9)
29097 +       // Like Entry 8, except for instruction access
29098 +#ifdef CONFIG_XEN
29099 +       movl r16=XSI_IFA
29100 +       ;;
29101 +       ld8 r16=[r16]
29102 +       ;;
29103 +#else
29104 +       mov r16=cr.ifa                          // get the address that caused the fault
29105 +#endif
29106 +       movl r30=1f                             // load continuation point in case of nested fault
29107 +       mov r31=pr                              // save predicates
29108 +#ifdef CONFIG_ITANIUM
29109 +       /*
29110 +        * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
29111 +        */
29112 +       mov r17=cr.ipsr
29113 +       ;;
29114 +       mov r18=cr.iip
29115 +       tbit.z p6,p0=r17,IA64_PSR_IS_BIT        // IA64 instruction set?
29116 +       ;;
29117 +(p6)   mov r16=r18                             // if so, use cr.iip instead of cr.ifa
29118 +#endif /* CONFIG_ITANIUM */
29119 +       ;;
29120 +#ifdef CONFIG_XEN
29121 +       mov r18=r8;
29122 +       mov r8=r16;
29123 +       XEN_HYPER_THASH;;
29124 +       mov r17=r8;
29125 +       mov r8=r18;;
29126 +#else
29127 +       thash r17=r16                           // compute virtual address of L3 PTE
29128 +#endif
29129 +       mov r29=b0                              // save b0 in case of nested fault
29130 +#ifdef CONFIG_SMP
29131 +       mov r28=ar.ccv                          // save ar.ccv
29132 +       ;;
29133 +1:     ld8 r18=[r17]
29134 +       ;;
29135 +       mov ar.ccv=r18                          // set compare value for cmpxchg
29136 +       or r25=_PAGE_A,r18                      // set the accessed bit
29137 +       tbit.z p7,p6 = r18,_PAGE_P_BIT          // Check present bit
29138 +       ;;
29139 +(p6)   cmpxchg8.acq r26=[r17],r25,ar.ccv       // Only if page present
29140 +       mov r24=PAGE_SHIFT<<2
29141 +       ;;
29142 +(p6)   cmp.eq p6,p7=r26,r18                    // Only if page present
29143 +       ;;
29144 +#ifdef CONFIG_XEN
29145 +       mov r26=r8
29146 +       mov r8=r25
29147 +       ;;
29148 +(p6)   XEN_HYPER_ITC_I
29149 +       ;;
29150 +       mov r8=r26
29151 +       ;;
29152 +#else
29153 +(p6)   itc.i r25                               // install updated PTE
29154 +#endif
29155 +       ;;
29156 +       /*
29157 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
29158 +        * cannot possibly affect the following loads:
29159 +        */
29160 +       dv_serialize_data
29161 +
29162 +       ld8 r18=[r17]                           // read PTE again
29163 +       ;;
29164 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
29165 +       ;;
29166 +(p7)   ptc.l r16,r24
29167 +       mov b0=r29                              // restore b0
29168 +       mov ar.ccv=r28
29169 +#else /* !CONFIG_SMP */
29170 +       ;;
29171 +1:     ld8 r18=[r17]
29172 +       ;;
29173 +       or r18=_PAGE_A,r18                      // set the accessed bit
29174 +       mov b0=r29                              // restore b0
29175 +       ;;
29176 +       st8 [r17]=r18                           // store back updated PTE
29177 +       itc.i r18                               // install updated PTE
29178 +#endif /* !CONFIG_SMP */
29179 +       mov pr=r31,-1
29180 +#ifdef CONFIG_XEN
29181 +       XEN_HYPER_RFI
29182 +       dv_serialize_data
29183 +#else
29184 +       rfi
29185 +#endif
29186 +END(iaccess_bit)
29187 +
29188 +       .org ia64_ivt+0x2800
29189 +/////////////////////////////////////////////////////////////////////////////////////////
29190 +// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
29191 +ENTRY(daccess_bit)
29192 +       DBG_FAULT(10)
29193 +       // Like Entry 8, except for data access
29194 +#ifdef CONFIG_XEN
29195 +       movl r16=XSI_IFA
29196 +       ;;
29197 +       ld8 r16=[r16]
29198 +       ;;
29199 +#else
29200 +       mov r16=cr.ifa                          // get the address that caused the fault
29201 +#endif
29202 +       movl r30=1f                             // load continuation point in case of nested fault
29203 +       ;;
29204 +#ifdef CONFIG_XEN
29205 +       mov r18=r8
29206 +       mov r8=r16
29207 +       XEN_HYPER_THASH
29208 +       ;;
29209 +       mov r17=r8
29210 +       mov r8=r18
29211 +       ;;
29212 +#else
29213 +       thash r17=r16                           // compute virtual address of L3 PTE
29214 +#endif
29215 +       mov r31=pr
29216 +       mov r29=b0                              // save b0 in case of nested fault
29217 +#ifdef CONFIG_SMP
29218 +       mov r28=ar.ccv                          // save ar.ccv
29219 +       ;;
29220 +1:     ld8 r18=[r17]
29221 +       ;;                                      // avoid RAW on r18
29222 +       mov ar.ccv=r18                          // set compare value for cmpxchg
29223 +       or r25=_PAGE_A,r18                      // set the accessed bit
29224 +       tbit.z p7,p6 = r18,_PAGE_P_BIT          // Check present bit
29225 +       ;;
29226 +(p6)   cmpxchg8.acq r26=[r17],r25,ar.ccv       // Only if page is present
29227 +       mov r24=PAGE_SHIFT<<2
29228 +       ;;
29229 +(p6)   cmp.eq p6,p7=r26,r18                    // Only if page is present
29230 +       ;;
29231 +#ifdef CONFIG_XEN
29232 +       mov r26=r8
29233 +       mov r8=r25
29234 +       ;;
29235 +(p6)   XEN_HYPER_ITC_D
29236 +       ;;
29237 +       mov r8=r26
29238 +       ;;
29239 +#else
29240 +(p6)   itc.d r25                               // install updated PTE
29241 +#endif
29242 +       /*
29243 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
29244 +        * cannot possibly affect the following loads:
29245 +        */
29246 +       dv_serialize_data
29247 +       ;;
29248 +       ld8 r18=[r17]                           // read PTE again
29249 +       ;;
29250 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
29251 +       ;;
29252 +(p7)   ptc.l r16,r24
29253 +       mov ar.ccv=r28
29254 +#else
29255 +       ;;
29256 +1:     ld8 r18=[r17]
29257 +       ;;                                      // avoid RAW on r18
29258 +       or r18=_PAGE_A,r18                      // set the accessed bit
29259 +       ;;
29260 +       st8 [r17]=r18                           // store back updated PTE
29261 +       itc.d r18                               // install updated PTE
29262 +#endif
29263 +       mov b0=r29                              // restore b0
29264 +       mov pr=r31,-1
29265 +#ifdef CONFIG_XEN
29266 +       XEN_HYPER_RFI
29267 +       dv_serialize_data
29268 +#else
29269 +       rfi
29270 +#endif
29271 +END(daccess_bit)
29272 +
29273 +       .org ia64_ivt+0x2c00
29274 +/////////////////////////////////////////////////////////////////////////////////////////
29275 +// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
29276 +ENTRY(break_fault)
29277 +       /*
29278 +        * The streamlined system call entry/exit paths only save/restore the initial part
29279 +        * of pt_regs.  This implies that the callers of system-calls must adhere to the
29280 +        * normal procedure calling conventions.
29281 +        *
29282 +        *   Registers to be saved & restored:
29283 +        *      CR registers: cr.ipsr, cr.iip, cr.ifs
29284 +        *      AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
29285 +        *      others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
29286 +        *   Registers to be restored only:
29287 +        *      r8-r11: output value from the system call.
29288 +        *
29289 +        * During system call exit, scratch registers (including r15) are modified/cleared
29290 +        * to prevent leaking bits from kernel to user level.
29291 +        */
29292 +       DBG_FAULT(11)
29293 +       mov.m r16=IA64_KR(CURRENT)              // M2 r16 <- current task (12 cyc)
29294 +#ifdef CONFIG_XEN
29295 +       movl r22=XSI_IPSR
29296 +       ;;
29297 +       ld8 r29=[r22],XSI_IIM_OFS-XSI_IPSR_OFS  // get ipsr, point to iim
29298 +#else
29299 +       mov r29=cr.ipsr                         // M2 (12 cyc)
29300 +#endif
29301 +       mov r31=pr                              // I0 (2 cyc)
29302 +
29303 +#ifdef CONFIG_XEN
29304 +       ;;
29305 +       ld8 r17=[r22],XSI_IIP_OFS-XSI_IIM_OFS
29306 +#else
29307 +       mov r17=cr.iim                          // M2 (2 cyc)
29308 +#endif
29309 +       mov.m r27=ar.rsc                        // M2 (12 cyc)
29310 +       mov r18=__IA64_BREAK_SYSCALL            // A
29311 +
29312 +       mov.m ar.rsc=0                          // M2
29313 +       mov.m r21=ar.fpsr                       // M2 (12 cyc)
29314 +       mov r19=b6                              // I0 (2 cyc)
29315 +       ;;
29316 +       mov.m r23=ar.bspstore                   // M2 (12 cyc)
29317 +       mov.m r24=ar.rnat                       // M2 (5 cyc)
29318 +       mov.i r26=ar.pfs                        // I0 (2 cyc)
29319 +
29320 +       invala                                  // M0|1
29321 +       nop.m 0                                 // M
29322 +       mov r20=r1                              // A                    save r1
29323 +
29324 +       nop.m 0
29325 +       movl r30=sys_call_table                 // X
29326 +
29327 +#ifdef CONFIG_XEN
29328 +       ld8 r28=[r22]
29329 +#else
29330 +       mov r28=cr.iip                          // M2 (2 cyc)
29331 +#endif
29332 +       cmp.eq p0,p7=r18,r17                    // I0 is this a system call?
29333 +(p7)   br.cond.spnt non_syscall                // B  no ->
29334 +       //
29335 +       // From this point on, we are definitely on the syscall-path
29336 +       // and we can use (non-banked) scratch registers.
29337 +       //
29338 +///////////////////////////////////////////////////////////////////////
29339 +       mov r1=r16                              // A    move task-pointer to "addl"-addressable reg
29340 +       mov r2=r16                              // A    setup r2 for ia64_syscall_setup
29341 +       add r9=TI_FLAGS+IA64_TASK_SIZE,r16      // A    r9 = &current_thread_info()->flags
29342 +
29343 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
29344 +       adds r15=-1024,r15                      // A    subtract 1024 from syscall number
29345 +       mov r3=NR_syscalls - 1
29346 +       ;;
29347 +       ld1.bias r17=[r16]                      // M0|1 r17 = current->thread.on_ustack flag
29348 +       ld4 r9=[r9]                             // M0|1 r9 = current_thread_info()->flags
29349 +       extr.u r8=r29,41,2                      // I0   extract ei field from cr.ipsr
29350 +
29351 +       shladd r30=r15,3,r30                    // A    r30 = sys_call_table + 8*(syscall-1024)
29352 +       addl r22=IA64_RBS_OFFSET,r1             // A    compute base of RBS
29353 +       cmp.leu p6,p7=r15,r3                    // A    syscall number in range?
29354 +       ;;
29355 +
29356 +       lfetch.fault.excl.nt1 [r22]             // M0|1 prefetch RBS
29357 +(p6)   ld8 r30=[r30]                           // M0|1 load address of syscall entry point
29358 +       tnat.nz.or p7,p0=r15                    // I0   is syscall nr a NaT?
29359 +
29360 +       mov.m ar.bspstore=r22                   // M2   switch to kernel RBS
29361 +       cmp.eq p8,p9=2,r8                       // A    ipsr.ei==2?
29362 +       ;;
29363 +
29364 +(p8)   mov r8=0                                // A    clear ei to 0
29365 +(p7)   movl r30=sys_ni_syscall                 // X
29366 +
29367 +(p8)   adds r28=16,r28                         // A    switch cr.iip to next bundle
29368 +(p9)   adds r8=1,r8                            // A    increment ei to next slot
29369 +       nop.i 0
29370 +       ;;
29371 +
29372 +       mov.m r25=ar.unat                       // M2 (5 cyc)
29373 +       dep r29=r8,r29,41,2                     // I0   insert new ei into cr.ipsr
29374 +       adds r15=1024,r15                       // A    restore original syscall number
29375 +       //
29376 +       // If any of the above loads miss in L1D, we'll stall here until
29377 +       // the data arrives.
29378 +       //
29379 +///////////////////////////////////////////////////////////////////////
29380 +       st1 [r16]=r0                            // M2|3 clear current->thread.on_ustack flag
29381 +       mov b6=r30                              // I0   setup syscall handler branch reg early
29382 +       cmp.eq pKStk,pUStk=r0,r17               // A    were we on kernel stacks already?
29383 +
29384 +       and r9=_TIF_SYSCALL_TRACEAUDIT,r9       // A    mask trace or audit
29385 +       mov r18=ar.bsp                          // M2 (12 cyc)
29386 +(pKStk)        br.cond.spnt .break_fixup               // B    we're already in kernel-mode -- fix up RBS
29387 +       ;;
29388 +.back_from_break_fixup:
29389 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A    compute base of memory stack
29390 +       cmp.eq p14,p0=r9,r0                     // A    are syscalls being traced/audited?
29391 +       br.call.sptk.many b7=ia64_syscall_setup // B
29392 +1:
29393 +       mov ar.rsc=0x3                          // M2   set eager mode, pl 0, LE, loadrs=0
29394 +       nop 0
29395 +#ifdef CONFIG_XEN
29396 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;;
29397 +#else
29398 +       bsw.1                                   // B (6 cyc) regs are saved, switch to bank 1
29399 +#endif
29400 +       ;;
29401 +
29402 +#ifdef CONFIG_XEN
29403 +       movl r16=XSI_PSR_IC
29404 +       mov r3=1
29405 +       ;;
29406 +       st4 [r16]=r3,XSI_PSR_I_ADDR_OFS-XSI_PSR_IC_OFS  // vpsr.ic = 1
29407 +#else
29408 +       ssm psr.ic | PSR_DEFAULT_BITS           // M2   now it's safe to re-enable intr.-collection
29409 +#endif
29410 +       movl r3=ia64_ret_from_syscall           // X
29411 +       ;;
29412 +
29413 +       srlz.i                                  // M0   ensure interruption collection is on
29414 +       mov rp=r3                               // I0   set the real return addr
29415 +(p10)  br.cond.spnt.many ia64_ret_from_syscall // B    return if bad call-frame or r15 is a NaT
29416 +
29417 +#ifdef CONFIG_XEN
29418 +(p15)  ld8 r16=[r16]                           // vpsr.i
29419 +       ;;
29420 +(p15)  st1 [r16]=r0,-1         // if (p15) vpsr.i = 1
29421 +       mov r2=r0
29422 +       ;;
29423 +(p15)  ld1 r2=[r16]                            // if (pending_events)
29424 +       ;;
29425 +       cmp.ne  p6,p0=r2,r0
29426 +       ;;
29427 +(p6)   ssm     psr.i                           //   do a real ssm psr.i
29428 +#else
29429 +(p15)  ssm psr.i                               // M2   restore psr.i
29430 +#endif
29431 +(p14)  br.call.sptk.many b6=b6                 // B    invoke syscall-handler (ignore return addr)
29432 +       br.cond.spnt.many ia64_trace_syscall    // B    do syscall-tracing thingamagic
29433 +       // NOT REACHED
29434 +///////////////////////////////////////////////////////////////////////
29435 +       // On entry, we optimistically assumed that we're coming from user-space.
29436 +       // For the rare cases where a system-call is done from within the kernel,
29437 +       // we fix things up at this point:
29438 +.break_fixup:
29439 +       add r1=-IA64_PT_REGS_SIZE,sp            // A    allocate space for pt_regs structure
29440 +       mov ar.rnat=r24                         // M2   restore kernel's AR.RNAT
29441 +       ;;
29442 +       mov ar.bspstore=r23                     // M2   restore kernel's AR.BSPSTORE
29443 +       br.cond.sptk .back_from_break_fixup
29444 +END(break_fault)
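
The arithmetic in the dispatch sequence above is easier to see in C: the syscall number arrives in r15 with a 1024 bias, a single unsigned comparison doubles as the range check, and anything out of range (or an r15 carrying a NaT) falls through to sys_ni_syscall. A sketch; sys_call_table and sys_ni_syscall are the real symbols the assembly references, the function-pointer type is illustrative only:

        /* Rough C analogue of the table lookup in break_fault above. */
        typedef long (*syscall_handler_t)(void);               /* illustrative only */
        extern syscall_handler_t sys_call_table[NR_syscalls];  /* real symbol, illustrative type */
        extern long sys_ni_syscall(void);

        static syscall_handler_t pick_handler(unsigned long r15, int r15_is_nat)
        {
                unsigned long idx = r15 - 1024;                /* adds r15=-1024,r15 */

                if (idx <= NR_syscalls - 1 && !r15_is_nat)     /* cmp.leu / tnat.nz.or */
                        return sys_call_table[idx];            /* shladd r30=r15,3,r30 ; ld8 */
                return sys_ni_syscall;                         /* (p7) movl r30=sys_ni_syscall */
        }
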
29445 +
29446 +       .org ia64_ivt+0x3000
29447 +/////////////////////////////////////////////////////////////////////////////////////////
29448 +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
29449 +ENTRY(interrupt)
29450 +       DBG_FAULT(12)
29451 +       mov r31=pr              // prepare to save predicates
29452 +       ;;
29453 +       SAVE_MIN_WITH_COVER     // uses r31; defines r2 and r3
29454 +#ifdef CONFIG_XEN
29455 +       movl r3=XSI_PSR_IC
29456 +       mov r14=1
29457 +       ;;
29458 +       st4 [r3]=r14
29459 +#else
29460 +       ssm psr.ic | PSR_DEFAULT_BITS
29461 +#endif
29462 +       ;;
29463 +       adds r3=8,r2            // set up second base pointer for SAVE_REST
29464 +       srlz.i                  // ensure everybody knows psr.ic is back on
29465 +       ;;
29466 +       SAVE_REST
29467 +       ;;
29468 +       alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
29469 +#ifdef CONFIG_XEN
29470 +       ;;
29471 +       br.call.sptk.many rp=xen_get_ivr
29472 +       ;;
29473 +       mov out0=r8             // pass cr.ivr as first arg
29474 +#else
29475 +       mov out0=cr.ivr         // pass cr.ivr as first arg
29476 +#endif
29477 +       add out1=16,sp          // pass pointer to pt_regs as second arg
29478 +       ;;
29479 +       srlz.d                  // make sure we see the effect of cr.ivr
29480 +       movl r14=ia64_leave_kernel
29481 +       ;;
29482 +       mov rp=r14
29483 +       br.call.sptk.many b6=ia64_handle_irq
29484 +END(interrupt)
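
The only change Xen makes to the external-interrupt path is how the vector is obtained: instead of reading cr.ivr directly, the guest calls xen_get_ivr (a wrapper this patch adds elsewhere) and passes the result to ia64_handle_irq together with pt_regs. Roughly, with the xen_get_ivr prototype assumed for illustration:

        /* Rough C view of ENTRY(interrupt) above.  ia64_handle_irq() is the
         * real kernel handler; the xen_get_ivr() prototype is assumed.
         */
        static void external_interrupt_sketch(struct pt_regs *regs)
        {
        #ifdef CONFIG_XEN
                ia64_vector vec = xen_get_ivr();                /* ask Xen for cr.ivr */
        #else
                ia64_vector vec = ia64_getreg(_IA64_REG_CR_IVR);
        #endif
                ia64_handle_irq(vec, regs);                     /* returns via ia64_leave_kernel */
        }
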
29485 +
29486 +       .org ia64_ivt+0x3400
29487 +/////////////////////////////////////////////////////////////////////////////////////////
29488 +// 0x3400 Entry 13 (size 64 bundles) Reserved
29489 +       DBG_FAULT(13)
29490 +       FAULT(13)
29491 +
29492 +       .org ia64_ivt+0x3800
29493 +/////////////////////////////////////////////////////////////////////////////////////////
29494 +// 0x3800 Entry 14 (size 64 bundles) Reserved
29495 +       DBG_FAULT(14)
29496 +       FAULT(14)
29497 +
29498 +       /*
29499 +        * There is no particular reason for this code to be here, other than that
29500 +        * there happens to be space here that would go unused otherwise.  If this
29501 +        * fault ever gets "unreserved", simply move the following code to a more
29502 +        * suitable spot...
29503 +        *
29504 +        * ia64_syscall_setup() is a separate subroutine so that it can
29505 +        *      allocate stacked registers so it can safely demine any
29506 +        *      potential NaT values from the input registers.
29507 +        *
29508 +        * On entry:
29509 +        *      - executing on bank 0 or bank 1 register set (doesn't matter)
29510 +        *      -  r1: stack pointer
29511 +        *      -  r2: current task pointer
29512 +        *      -  r3: preserved
29513 +        *      - r11: original contents (saved ar.pfs to be saved)
29514 +        *      - r12: original contents (sp to be saved)
29515 +        *      - r13: original contents (tp to be saved)
29516 +        *      - r15: original contents (syscall # to be saved)
29517 +        *      - r18: saved bsp (after switching to kernel stack)
29518 +        *      - r19: saved b6
29519 +        *      - r20: saved r1 (gp)
29520 +        *      - r21: saved ar.fpsr
29521 +        *      - r22: kernel's register backing store base (krbs_base)
29522 +        *      - r23: saved ar.bspstore
29523 +        *      - r24: saved ar.rnat
29524 +        *      - r25: saved ar.unat
29525 +        *      - r26: saved ar.pfs
29526 +        *      - r27: saved ar.rsc
29527 +        *      - r28: saved cr.iip
29528 +        *      - r29: saved cr.ipsr
29529 +        *      - r31: saved pr
29530 +        *      -  b0: original contents (to be saved)
29531 +        * On exit:
29532 +        *      -  p10: TRUE if syscall is invoked with more than 8 out
29533 +        *              registers or r15's NaT is true
29534 +        *      -  r1: kernel's gp
29535 +        *      -  r3: preserved (same as on entry)
29536 +        *      -  r8: -EINVAL if p10 is true
29537 +        *      - r12: points to kernel stack
29538 +        *      - r13: points to current task
29539 +        *      - r14: preserved (same as on entry)
29540 +        *      - p13: preserved
29541 +        *      - p15: TRUE if interrupts need to be re-enabled
29542 +        *      - ar.fpsr: set to kernel settings
29543 +        *      -  b6: preserved (same as on entry)
29544 +        */
29545 +#ifndef CONFIG_XEN
29546 +GLOBAL_ENTRY(ia64_syscall_setup)
29547 +#if PT(B6) != 0
29548 +# error This code assumes that b6 is the first field in pt_regs.
29549 +#endif
29550 +       st8 [r1]=r19                            // save b6
29551 +       add r16=PT(CR_IPSR),r1                  // initialize first base pointer
29552 +       add r17=PT(R11),r1                      // initialize second base pointer
29553 +       ;;
29554 +       alloc r19=ar.pfs,8,0,0,0                // ensure in0-in7 are writable
29555 +       st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)    // save cr.ipsr
29556 +       tnat.nz p8,p0=in0
29557 +
29558 +       st8.spill [r17]=r11,PT(CR_IIP)-PT(R11)  // save r11
29559 +       tnat.nz p9,p0=in1
29560 +(pKStk)        mov r18=r0                              // make sure r18 isn't NaT
29561 +       ;;
29562 +
29563 +       st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS)     // save ar.pfs
29564 +       st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP)    // save cr.iip
29565 +       mov r28=b0                              // save b0 (2 cyc)
29566 +       ;;
29567 +
29568 +       st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT)    // save ar.unat
29569 +       dep r19=0,r19,38,26                     // clear all bits but 0..37 [I0]
29570 +(p8)   mov in0=-1
29571 +       ;;
29572 +
29573 +       st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS)    // store ar.pfs.pfm in cr.ifs
29574 +       extr.u r11=r19,7,7      // I0           // get sol of ar.pfs
29575 +       and r8=0x7f,r19         // A            // get sof of ar.pfs
29576 +
29577 +       st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
29578 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
29579 +(p9)   mov in1=-1
29580 +       ;;
29581 +
29582 +(pUStk) sub r18=r18,r22                                // r18=RSE.ndirty*8
29583 +       tnat.nz p10,p0=in2
29584 +       add r11=8,r11
29585 +       ;;
29586 +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16                // skip over ar_rnat field
29587 +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17    // skip over ar_bspstore field
29588 +       tnat.nz p11,p0=in3
29589 +       ;;
29590 +(p10)  mov in2=-1
29591 +       tnat.nz p12,p0=in4                              // [I0]
29592 +(p11)  mov in3=-1
29593 +       ;;
29594 +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT)       // save ar.rnat
29595 +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE)   // save ar.bspstore
29596 +       shl r18=r18,16                          // compute ar.rsc to be used for "loadrs"
29597 +       ;;
29598 +       st8 [r16]=r31,PT(LOADRS)-PT(PR)         // save predicates
29599 +       st8 [r17]=r28,PT(R1)-PT(B0)             // save b0
29600 +       tnat.nz p13,p0=in5                              // [I0]
29601 +       ;;
29602 +       st8 [r16]=r18,PT(R12)-PT(LOADRS)        // save ar.rsc value for "loadrs"
29603 +       st8.spill [r17]=r20,PT(R13)-PT(R1)      // save original r1
29604 +(p12)  mov in4=-1
29605 +       ;;
29606 +
29607 +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12)       // save r12
29608 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13)           // save r13
29609 +(p13)  mov in5=-1
29610 +       ;;
29611 +       st8 [r16]=r21,PT(R8)-PT(AR_FPSR)        // save ar.fpsr
29612 +       tnat.nz p13,p0=in6
29613 +       cmp.lt p10,p9=r11,r8    // frame size can't be more than local+8
29614 +       ;;
29615 +       mov r8=1
29616 +(p9)   tnat.nz p10,p0=r15
29617 +       adds r12=-16,r1         // switch to kernel memory stack (with 16 bytes of scratch)
29618 +
29619 +       st8.spill [r17]=r15                     // save r15
29620 +       tnat.nz p8,p0=in7
29621 +       nop.i 0
29622 +
29623 +       mov r13=r2                              // establish `current'
29624 +       movl r1=__gp                            // establish kernel global pointer
29625 +       ;;
29626 +       st8 [r16]=r8            // ensure pt_regs.r8 != 0 (see handle_syscall_error)
29627 +(p13)  mov in6=-1
29628 +(p8)   mov in7=-1
29629 +
29630 +       cmp.eq pSys,pNonSys=r0,r0               // set pSys=1, pNonSys=0
29631 +       movl r17=FPSR_DEFAULT
29632 +       ;;
29633 +       mov.m ar.fpsr=r17                       // set ar.fpsr to kernel default value
29634 +(p10)  mov r8=-EINVAL
29635 +       br.ret.sptk.many b7
29636 +END(ia64_syscall_setup)
29637 +#endif
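
Two details of the contract above are worth spelling out: every input register whose NaT bit is set is replaced with -1 before any C code can consume it, and the p10/-EINVAL path is taken when the call frame declares more than eight output registers or r15 itself is a NaT. In rough C, where is_nat(), sol() and sof() are illustrative helpers rather than kernel APIs:

        /* Sketch of the argument "demining" and frame check in ia64_syscall_setup.
         * is_nat(), sol() and sof() are illustrative helpers only.
         */
        static long demine_and_check(unsigned long in[8], unsigned long ar_pfs,
                                     int r15_is_nat)
        {
                for (int i = 0; i < 8; i++)
                        if (is_nat(in[i]))
                                in[i] = -1;              /* tnat.nz pN ; (pN) mov inN=-1 */

                if (sof(ar_pfs) > sol(ar_pfs) + 8 || r15_is_nat)
                        return -EINVAL;                  /* p10 path */
                return 0;
        }
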
29638 +
29639 +       .org ia64_ivt+0x3c00
29640 +/////////////////////////////////////////////////////////////////////////////////////////
29641 +// 0x3c00 Entry 15 (size 64 bundles) Reserved
29642 +       DBG_FAULT(15)
29643 +       FAULT(15)
29644 +
29645 +       /*
29646 +        * Squatting in this space ...
29647 +        *
29648 +        * This special case dispatcher for illegal operation faults allows preserved
29649 +        * registers to be modified through a callback function (asm only) that is handed
29650 +        * back from the fault handler in r8. Up to three arguments can be passed to the
29651 +        * callback function by returning an aggregate with the callback as its first
29652 +        * element, followed by the arguments.
29653 +        */
29654 +ENTRY(dispatch_illegal_op_fault)
29655 +       .prologue
29656 +       .body
29657 +       SAVE_MIN_WITH_COVER
29658 +       ssm psr.ic | PSR_DEFAULT_BITS
29659 +       ;;
29660 +       srlz.i          // guarantee that interruption collection is on
29661 +       ;;
29662 +(p15)  ssm psr.i       // restore psr.i
29663 +       adds r3=8,r2    // set up second base pointer for SAVE_REST
29664 +       ;;
29665 +       alloc r14=ar.pfs,0,0,1,0        // must be first in insn group
29666 +       mov out0=ar.ec
29667 +       ;;
29668 +       SAVE_REST
29669 +       PT_REGS_UNWIND_INFO(0)
29670 +       ;;
29671 +       br.call.sptk.many rp=ia64_illegal_op_fault
29672 +.ret0: ;;
29673 +       alloc r14=ar.pfs,0,0,3,0        // must be first in insn group
29674 +       mov out0=r9
29675 +       mov out1=r10
29676 +       mov out2=r11
29677 +       movl r15=ia64_leave_kernel
29678 +       ;;
29679 +       mov rp=r15
29680 +       mov b6=r8
29681 +       ;;
29682 +       cmp.ne p6,p0=0,r8
29683 +(p6)   br.call.dpnt.many b6=b6         // call returns to ia64_leave_kernel
29684 +       br.sptk.many ia64_leave_kernel
29685 +END(dispatch_illegal_op_fault)
29686 +
29687 +       .org ia64_ivt+0x4000
29688 +/////////////////////////////////////////////////////////////////////////////////////////
29689 +// 0x4000 Entry 16 (size 64 bundles) Reserved
29690 +       DBG_FAULT(16)
29691 +       FAULT(16)
29692 +
29693 +       .org ia64_ivt+0x4400
29694 +/////////////////////////////////////////////////////////////////////////////////////////
29695 +// 0x4400 Entry 17 (size 64 bundles) Reserved
29696 +       DBG_FAULT(17)
29697 +       FAULT(17)
29698 +
29699 +ENTRY(non_syscall)
29700 +       mov ar.rsc=r27                  // restore ar.rsc before SAVE_MIN_WITH_COVER
29701 +       ;;
29702 +       SAVE_MIN_WITH_COVER
29703 +
29704 +       // There is no particular reason for this code to be here, other than that
29705 +       // there happens to be space here that would go unused otherwise.  If this
29706 +       // fault ever gets "unreserved", simply move the following code to a more
29707 +       // suitable spot...
29708 +
29709 +       alloc r14=ar.pfs,0,0,2,0
29710 +       mov out0=cr.iim
29711 +       add out1=16,sp
29712 +       adds r3=8,r2                    // set up second base pointer for SAVE_REST
29713 +
29714 +       ssm psr.ic | PSR_DEFAULT_BITS
29715 +       ;;
29716 +       srlz.i                          // guarantee that interruption collection is on
29717 +       ;;
29718 +(p15)  ssm psr.i                       // restore psr.i
29719 +       movl r15=ia64_leave_kernel
29720 +       ;;
29721 +       SAVE_REST
29722 +       mov rp=r15
29723 +       ;;
29724 +       br.call.sptk.many b6=ia64_bad_break     // avoid WAW on CFM and ignore return addr
29725 +END(non_syscall)
29726 +
29727 +       .org ia64_ivt+0x4800
29728 +/////////////////////////////////////////////////////////////////////////////////////////
29729 +// 0x4800 Entry 18 (size 64 bundles) Reserved
29730 +       DBG_FAULT(18)
29731 +       FAULT(18)
29732 +
29733 +       /*
29734 +        * There is no particular reason for this code to be here, other than that
29735 +        * there happens to be space here that would go unused otherwise.  If this
29736 +        * fault ever gets "unreserved", simply move the following code to a more
29737 +        * suitable spot...
29738 +        */
29739 +
29740 +ENTRY(dispatch_unaligned_handler)
29741 +       SAVE_MIN_WITH_COVER
29742 +       ;;
29743 +       alloc r14=ar.pfs,0,0,2,0                // now it's safe (must be first in insn group!)
29744 +       mov out0=cr.ifa
29745 +       adds out1=16,sp
29746 +
29747 +       ssm psr.ic | PSR_DEFAULT_BITS
29748 +       ;;
29749 +       srlz.i                                  // guarantee that interruption collection is on
29750 +       ;;
29751 +(p15)  ssm psr.i                               // restore psr.i
29752 +       adds r3=8,r2                            // set up second base pointer
29753 +       ;;
29754 +       SAVE_REST
29755 +       movl r14=ia64_leave_kernel
29756 +       ;;
29757 +       mov rp=r14
29758 +       br.sptk.many ia64_prepare_handle_unaligned
29759 +END(dispatch_unaligned_handler)
29760 +
29761 +       .org ia64_ivt+0x4c00
29762 +/////////////////////////////////////////////////////////////////////////////////////////
29763 +// 0x4c00 Entry 19 (size 64 bundles) Reserved
29764 +       DBG_FAULT(19)
29765 +       FAULT(19)
29766 +
29767 +       /*
29768 +        * There is no particular reason for this code to be here, other than that
29769 +        * there happens to be space here that would go unused otherwise.  If this
29770 +        * fault ever gets "unreserved", simply move the following code to a more
29771 +        * suitable spot...
29772 +        */
29773 +
29774 +ENTRY(dispatch_to_fault_handler)
29775 +       /*
29776 +        * Input:
29777 +        *      psr.ic: off
29778 +        *      r19:    fault vector number (e.g., 24 for General Exception)
29779 +        *      r31:    contains saved predicates (pr)
29780 +        */
29781 +       SAVE_MIN_WITH_COVER_R19
29782 +       alloc r14=ar.pfs,0,0,5,0
29783 +       mov out0=r15
29784 +#ifdef CONFIG_XEN
29785 +       movl out1=XSI_ISR
29786 +       ;;
29787 +       adds out2=XSI_IFA-XSI_ISR,out1
29788 +       adds out3=XSI_IIM-XSI_ISR,out1
29789 +       adds out4=XSI_ITIR-XSI_ISR,out1
29790 +       ;;
29791 +       ld8 out1=[out1]
29792 +       ld8 out2=[out2]
29793 +       ld8 out3=[out3]
29794 +       ld8 out4=[out4]
29795 +       ;;
29796 +#else
29797 +       mov out1=cr.isr
29798 +       mov out2=cr.ifa
29799 +       mov out3=cr.iim
29800 +       mov out4=cr.itir
29801 +       ;;
29802 +#endif
29803 +       ssm psr.ic | PSR_DEFAULT_BITS
29804 +       ;;
29805 +       srlz.i                                  // guarantee that interruption collection is on
29806 +       ;;
29807 +(p15)  ssm psr.i                               // restore psr.i
29808 +       adds r3=8,r2                            // set up second base pointer for SAVE_REST
29809 +       ;;
29810 +       SAVE_REST
29811 +       movl r14=ia64_leave_kernel
29812 +       ;;
29813 +       mov rp=r14
29814 +       br.call.sptk.many b6=ia64_fault
29815 +END(dispatch_to_fault_handler)
29816 +
29817 +//
29818 +// --- End of long entries, Beginning of short entries
29819 +//
29820 +
29821 +       .org ia64_ivt+0x5000
29822 +/////////////////////////////////////////////////////////////////////////////////////////
29823 +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
29824 +ENTRY(page_not_present)
29825 +       DBG_FAULT(20)
29826 +       mov r16=cr.ifa
29827 +       rsm psr.dt
29828 +       /*
29829 +        * The Linux page fault handler doesn't expect non-present pages to be in
29830 +        * the TLB.  Flush the existing entry now, so we meet that expectation.
29831 +        */
29832 +       mov r17=PAGE_SHIFT<<2
29833 +       ;;
29834 +       ptc.l r16,r17
29835 +       ;;
29836 +       mov r31=pr
29837 +       srlz.d
29838 +       br.sptk.many page_fault
29839 +END(page_not_present)
29840 +
29841 +       .org ia64_ivt+0x5100
29842 +/////////////////////////////////////////////////////////////////////////////////////////
29843 +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
29844 +ENTRY(key_permission)
29845 +       DBG_FAULT(21)
29846 +       mov r16=cr.ifa
29847 +       rsm psr.dt
29848 +       mov r31=pr
29849 +       ;;
29850 +       srlz.d
29851 +       br.sptk.many page_fault
29852 +END(key_permission)
29853 +
29854 +       .org ia64_ivt+0x5200
29855 +/////////////////////////////////////////////////////////////////////////////////////////
29856 +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
29857 +ENTRY(iaccess_rights)
29858 +       DBG_FAULT(22)
29859 +       mov r16=cr.ifa
29860 +       rsm psr.dt
29861 +       mov r31=pr
29862 +       ;;
29863 +       srlz.d
29864 +       br.sptk.many page_fault
29865 +END(iaccess_rights)
29866 +
29867 +       .org ia64_ivt+0x5300
29868 +/////////////////////////////////////////////////////////////////////////////////////////
29869 +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
29870 +ENTRY(daccess_rights)
29871 +       DBG_FAULT(23)
29872 +#ifdef CONFIG_XEN
29873 +       movl r16=XSI_IFA
29874 +       ;;
29875 +       ld8 r16=[r16]
29876 +       ;;
29877 +       XEN_HYPER_RSM_PSR_DT
29878 +#else
29879 +       mov r16=cr.ifa
29880 +       rsm psr.dt
29881 +#endif
29882 +       mov r31=pr
29883 +       ;;
29884 +       srlz.d
29885 +       br.sptk.many page_fault
29886 +END(daccess_rights)
29887 +
29888 +       .org ia64_ivt+0x5400
29889 +/////////////////////////////////////////////////////////////////////////////////////////
29890 +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
29891 +ENTRY(general_exception)
29892 +       DBG_FAULT(24)
29893 +       mov r16=cr.isr
29894 +       mov r31=pr
29895 +       ;;
29896 +       cmp4.eq p6,p0=0,r16
29897 +(p6)   br.sptk.many dispatch_illegal_op_fault
29898 +       ;;
29899 +       mov r19=24              // fault number
29900 +       br.sptk.many dispatch_to_fault_handler
29901 +END(general_exception)
29902 +
29903 +       .org ia64_ivt+0x5500
29904 +/////////////////////////////////////////////////////////////////////////////////////////
29905 +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
29906 +ENTRY(disabled_fp_reg)
29907 +       DBG_FAULT(25)
29908 +       rsm psr.dfh             // ensure we can access fph
29909 +       ;;
29910 +       srlz.d
29911 +       mov r31=pr
29912 +       mov r19=25
29913 +       br.sptk.many dispatch_to_fault_handler
29914 +END(disabled_fp_reg)
29915 +
29916 +       .org ia64_ivt+0x5600
29917 +/////////////////////////////////////////////////////////////////////////////////////////
29918 +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
29919 +ENTRY(nat_consumption)
29920 +       DBG_FAULT(26)
29921 +
29922 +       mov r16=cr.ipsr
29923 +       mov r17=cr.isr
29924 +       mov r31=pr                              // save PR
29925 +       ;;
29926 +       and r18=0xf,r17                         // r18 = cr.isr.code{3:0}
29927 +       tbit.z p6,p0=r17,IA64_ISR_NA_BIT
29928 +       ;;
29929 +       cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18
29930 +       dep r16=-1,r16,IA64_PSR_ED_BIT,1
29931 +(p6)   br.cond.spnt 1f         // branch if (cr.isr.na == 0 || cr.isr.code{3:0} != LFETCH)
29932 +       ;;
29933 +       mov cr.ipsr=r16         // set cr.ipsr.ed
29934 +       mov pr=r31,-1
29935 +       ;;
29936 +       rfi
29937 +
29938 +1:     mov pr=r31,-1
29939 +       ;;
29940 +       FAULT(26)
29941 +END(nat_consumption)
29942 +
29943 +       .org ia64_ivt+0x5700
29944 +/////////////////////////////////////////////////////////////////////////////////////////
29945 +// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
29946 +ENTRY(speculation_vector)
29947 +       DBG_FAULT(27)
29948 +       /*
29949 +        * A [f]chk.[as] instruction needs to take the branch to the recovery code but
29950 +        * this part of the architecture is not implemented in hardware on some CPUs, such
29951 +        * as Itanium.  Thus, in general we need to emulate the behavior.  IIM contains
29952 +        * the relative target (not yet sign extended).  So after sign extending it we
29953 +        * simply add it to IIP.  We also need to reset the EI field of the IPSR to zero,
29954 +        * i.e., the slot to restart into.
29955 +        *
29956 +        * cr.iim contains zero_ext(imm21)
29957 +        */
29958 +       mov r18=cr.iim
29959 +       ;;
29960 +       mov r17=cr.iip
29961 +       shl r18=r18,43                  // put sign bit in position (43=64-21)
29962 +       ;;
29963 +
29964 +       mov r16=cr.ipsr
29965 +       shr r18=r18,39                  // sign extend (39=43-4)
29966 +       ;;
29967 +
29968 +       add r17=r17,r18                 // now add the offset
29969 +       ;;
29970 +       mov cr.iip=r17
29971 +       dep r16=0,r16,41,2              // clear EI
29972 +       ;;
29973 +
29974 +       mov cr.ipsr=r16
29975 +       ;;
29976 +
29977 +#ifdef CONFIG_XEN
29978 +       XEN_HYPER_RFI;
29979 +#else
29980 +       rfi                             // and go back
29981 +#endif
29982 +END(speculation_vector)
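
The two shifts above implement sign extension of the 21-bit immediate plus the implicit scaling by the 16-byte bundle size: shifting left by 43 moves bit 20 of imm21 into the sign position, and the arithmetic shift right by 39 brings it back down four bits short, leaving sign_ext(imm21) * 16. A worked C example (relies on arithmetic right shift of signed values, as gcc provides):

        /* Worked example of the shl 43 / shr 39 pair in speculation_vector:
         * it turns the zero-extended imm21 from cr.iim into a signed,
         * bundle-scaled branch offset to add to cr.iip.
         */
        static long speculation_offset(unsigned long iim)
        {
                /* bit 20 of imm21 lands in bit 63, so the >> sign-extends */
                return (long)(iim << 43) >> 39;       /* == sign_ext(imm21) * 16 */
        }

        /* e.g. imm21 = 0x1FFFFF (-1 bundle) yields -16; imm21 = 2 yields 32 */
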
29983 +
29984 +       .org ia64_ivt+0x5800
29985 +/////////////////////////////////////////////////////////////////////////////////////////
29986 +// 0x5800 Entry 28 (size 16 bundles) Reserved
29987 +       DBG_FAULT(28)
29988 +       FAULT(28)
29989 +
29990 +       .org ia64_ivt+0x5900
29991 +/////////////////////////////////////////////////////////////////////////////////////////
29992 +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
29993 +ENTRY(debug_vector)
29994 +       DBG_FAULT(29)
29995 +       FAULT(29)
29996 +END(debug_vector)
29997 +
29998 +       .org ia64_ivt+0x5a00
29999 +/////////////////////////////////////////////////////////////////////////////////////////
30000 +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
30001 +ENTRY(unaligned_access)
30002 +       DBG_FAULT(30)
30003 +       mov r31=pr              // prepare to save predicates
30004 +       ;;
30005 +       br.sptk.many dispatch_unaligned_handler
30006 +END(unaligned_access)
30007 +
30008 +       .org ia64_ivt+0x5b00
30009 +/////////////////////////////////////////////////////////////////////////////////////////
30010 +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
30011 +ENTRY(unsupported_data_reference)
30012 +       DBG_FAULT(31)
30013 +       FAULT(31)
30014 +END(unsupported_data_reference)
30015 +
30016 +       .org ia64_ivt+0x5c00
30017 +/////////////////////////////////////////////////////////////////////////////////////////
30018 +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
30019 +ENTRY(floating_point_fault)
30020 +       DBG_FAULT(32)
30021 +       FAULT(32)
30022 +END(floating_point_fault)
30023 +
30024 +       .org ia64_ivt+0x5d00
30025 +/////////////////////////////////////////////////////////////////////////////////////////
30026 +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
30027 +ENTRY(floating_point_trap)
30028 +       DBG_FAULT(33)
30029 +       FAULT(33)
30030 +END(floating_point_trap)
30031 +
30032 +       .org ia64_ivt+0x5e00
30033 +/////////////////////////////////////////////////////////////////////////////////////////
30034 +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
30035 +ENTRY(lower_privilege_trap)
30036 +       DBG_FAULT(34)
30037 +       FAULT(34)
30038 +END(lower_privilege_trap)
30039 +
30040 +       .org ia64_ivt+0x5f00
30041 +/////////////////////////////////////////////////////////////////////////////////////////
30042 +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
30043 +ENTRY(taken_branch_trap)
30044 +       DBG_FAULT(35)
30045 +       FAULT(35)
30046 +END(taken_branch_trap)
30047 +
30048 +       .org ia64_ivt+0x6000
30049 +/////////////////////////////////////////////////////////////////////////////////////////
30050 +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
30051 +ENTRY(single_step_trap)
30052 +       DBG_FAULT(36)
30053 +       FAULT(36)
30054 +END(single_step_trap)
30055 +
30056 +       .org ia64_ivt+0x6100
30057 +/////////////////////////////////////////////////////////////////////////////////////////
30058 +// 0x6100 Entry 37 (size 16 bundles) Reserved
30059 +       DBG_FAULT(37)
30060 +       FAULT(37)
30061 +
30062 +       .org ia64_ivt+0x6200
30063 +/////////////////////////////////////////////////////////////////////////////////////////
30064 +// 0x6200 Entry 38 (size 16 bundles) Reserved
30065 +       DBG_FAULT(38)
30066 +       FAULT(38)
30067 +
30068 +       .org ia64_ivt+0x6300
30069 +/////////////////////////////////////////////////////////////////////////////////////////
30070 +// 0x6300 Entry 39 (size 16 bundles) Reserved
30071 +       DBG_FAULT(39)
30072 +       FAULT(39)
30073 +
30074 +       .org ia64_ivt+0x6400
30075 +/////////////////////////////////////////////////////////////////////////////////////////
30076 +// 0x6400 Entry 40 (size 16 bundles) Reserved
30077 +       DBG_FAULT(40)
30078 +       FAULT(40)
30079 +
30080 +       .org ia64_ivt+0x6500
30081 +/////////////////////////////////////////////////////////////////////////////////////////
30082 +// 0x6500 Entry 41 (size 16 bundles) Reserved
30083 +       DBG_FAULT(41)
30084 +       FAULT(41)
30085 +
30086 +       .org ia64_ivt+0x6600
30087 +/////////////////////////////////////////////////////////////////////////////////////////
30088 +// 0x6600 Entry 42 (size 16 bundles) Reserved
30089 +       DBG_FAULT(42)
30090 +       FAULT(42)
30091 +
30092 +       .org ia64_ivt+0x6700
30093 +/////////////////////////////////////////////////////////////////////////////////////////
30094 +// 0x6700 Entry 43 (size 16 bundles) Reserved
30095 +       DBG_FAULT(43)
30096 +       FAULT(43)
30097 +
30098 +       .org ia64_ivt+0x6800
30099 +/////////////////////////////////////////////////////////////////////////////////////////
30100 +// 0x6800 Entry 44 (size 16 bundles) Reserved
30101 +       DBG_FAULT(44)
30102 +       FAULT(44)
30103 +
30104 +       .org ia64_ivt+0x6900
30105 +/////////////////////////////////////////////////////////////////////////////////////////
30106 +// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
30107 +ENTRY(ia32_exception)
30108 +       DBG_FAULT(45)
30109 +       FAULT(45)
30110 +END(ia32_exception)
30111 +
30112 +       .org ia64_ivt+0x6a00
30113 +/////////////////////////////////////////////////////////////////////////////////////////
30114 +// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
30115 +ENTRY(ia32_intercept)
30116 +       DBG_FAULT(46)
30117 +#ifdef CONFIG_IA32_SUPPORT
30118 +       mov r31=pr
30119 +       mov r16=cr.isr
30120 +       ;;
30121 +       extr.u r17=r16,16,8     // get ISR.code
30122 +       mov r18=ar.eflag
30123 +       mov r19=cr.iim          // old eflag value
30124 +       ;;
30125 +       cmp.ne p6,p0=2,r17
30126 +(p6)   br.cond.spnt 1f         // not a system flag fault
30127 +       xor r16=r18,r19
30128 +       ;;
30129 +       extr.u r17=r16,18,1     // get the eflags.ac bit
30130 +       ;;
30131 +       cmp.eq p6,p0=0,r17
30132 +(p6)   br.cond.spnt 1f         // eflags.ac bit didn't change
30133 +       ;;
30134 +       mov pr=r31,-1           // restore predicate registers
30135 +#ifdef CONFIG_XEN
30136 +       XEN_HYPER_RFI;
30137 +#else
30138 +       rfi
30139 +#endif
30140 +
30141 +1:
30142 +#endif // CONFIG_IA32_SUPPORT
30143 +       FAULT(46)
30144 +END(ia32_intercept)
30145 +
30146 +       .org ia64_ivt+0x6b00
30147 +/////////////////////////////////////////////////////////////////////////////////////////
30148 +// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
30149 +ENTRY(ia32_interrupt)
30150 +       DBG_FAULT(47)
30151 +#ifdef CONFIG_IA32_SUPPORT
30152 +       mov r31=pr
30153 +       br.sptk.many dispatch_to_ia32_handler
30154 +#else
30155 +       FAULT(47)
30156 +#endif
30157 +END(ia32_interrupt)
30158 +
30159 +       .org ia64_ivt+0x6c00
30160 +/////////////////////////////////////////////////////////////////////////////////////////
30161 +// 0x6c00 Entry 48 (size 16 bundles) Reserved
30162 +       DBG_FAULT(48)
30163 +       FAULT(48)
30164 +
30165 +       .org ia64_ivt+0x6d00
30166 +/////////////////////////////////////////////////////////////////////////////////////////
30167 +// 0x6d00 Entry 49 (size 16 bundles) Reserved
30168 +       DBG_FAULT(49)
30169 +       FAULT(49)
30170 +
30171 +       .org ia64_ivt+0x6e00
30172 +/////////////////////////////////////////////////////////////////////////////////////////
30173 +// 0x6e00 Entry 50 (size 16 bundles) Reserved
30174 +       DBG_FAULT(50)
30175 +       FAULT(50)
30176 +
30177 +       .org ia64_ivt+0x6f00
30178 +/////////////////////////////////////////////////////////////////////////////////////////
30179 +// 0x6f00 Entry 51 (size 16 bundles) Reserved
30180 +       DBG_FAULT(51)
30181 +       FAULT(51)
30182 +
30183 +       .org ia64_ivt+0x7000
30184 +/////////////////////////////////////////////////////////////////////////////////////////
30185 +// 0x7000 Entry 52 (size 16 bundles) Reserved
30186 +       DBG_FAULT(52)
30187 +       FAULT(52)
30188 +
30189 +       .org ia64_ivt+0x7100
30190 +/////////////////////////////////////////////////////////////////////////////////////////
30191 +// 0x7100 Entry 53 (size 16 bundles) Reserved
30192 +       DBG_FAULT(53)
30193 +       FAULT(53)
30194 +
30195 +       .org ia64_ivt+0x7200
30196 +/////////////////////////////////////////////////////////////////////////////////////////
30197 +// 0x7200 Entry 54 (size 16 bundles) Reserved
30198 +       DBG_FAULT(54)
30199 +       FAULT(54)
30200 +
30201 +       .org ia64_ivt+0x7300
30202 +/////////////////////////////////////////////////////////////////////////////////////////
30203 +// 0x7300 Entry 55 (size 16 bundles) Reserved
30204 +       DBG_FAULT(55)
30205 +       FAULT(55)
30206 +
30207 +       .org ia64_ivt+0x7400
30208 +/////////////////////////////////////////////////////////////////////////////////////////
30209 +// 0x7400 Entry 56 (size 16 bundles) Reserved
30210 +       DBG_FAULT(56)
30211 +       FAULT(56)
30212 +
30213 +       .org ia64_ivt+0x7500
30214 +/////////////////////////////////////////////////////////////////////////////////////////
30215 +// 0x7500 Entry 57 (size 16 bundles) Reserved
30216 +       DBG_FAULT(57)
30217 +       FAULT(57)
30218 +
30219 +       .org ia64_ivt+0x7600
30220 +/////////////////////////////////////////////////////////////////////////////////////////
30221 +// 0x7600 Entry 58 (size 16 bundles) Reserved
30222 +       DBG_FAULT(58)
30223 +       FAULT(58)
30224 +
30225 +       .org ia64_ivt+0x7700
30226 +/////////////////////////////////////////////////////////////////////////////////////////
30227 +// 0x7700 Entry 59 (size 16 bundles) Reserved
30228 +       DBG_FAULT(59)
30229 +       FAULT(59)
30230 +
30231 +       .org ia64_ivt+0x7800
30232 +/////////////////////////////////////////////////////////////////////////////////////////
30233 +// 0x7800 Entry 60 (size 16 bundles) Reserved
30234 +       DBG_FAULT(60)
30235 +       FAULT(60)
30236 +
30237 +       .org ia64_ivt+0x7900
30238 +/////////////////////////////////////////////////////////////////////////////////////////
30239 +// 0x7900 Entry 61 (size 16 bundles) Reserved
30240 +       DBG_FAULT(61)
30241 +       FAULT(61)
30242 +
30243 +       .org ia64_ivt+0x7a00
30244 +/////////////////////////////////////////////////////////////////////////////////////////
30245 +// 0x7a00 Entry 62 (size 16 bundles) Reserved
30246 +       DBG_FAULT(62)
30247 +       FAULT(62)
30248 +
30249 +       .org ia64_ivt+0x7b00
30250 +/////////////////////////////////////////////////////////////////////////////////////////
30251 +// 0x7b00 Entry 63 (size 16 bundles) Reserved
30252 +       DBG_FAULT(63)
30253 +       FAULT(63)
30254 +
30255 +       .org ia64_ivt+0x7c00
30256 +/////////////////////////////////////////////////////////////////////////////////////////
30257 +// 0x7c00 Entry 64 (size 16 bundles) Reserved
30258 +       DBG_FAULT(64)
30259 +       FAULT(64)
30260 +
30261 +       .org ia64_ivt+0x7d00
30262 +/////////////////////////////////////////////////////////////////////////////////////////
30263 +// 0x7d00 Entry 65 (size 16 bundles) Reserved
30264 +       DBG_FAULT(65)
30265 +       FAULT(65)
30266 +
30267 +       .org ia64_ivt+0x7e00
30268 +/////////////////////////////////////////////////////////////////////////////////////////
30269 +// 0x7e00 Entry 66 (size 16 bundles) Reserved
30270 +       DBG_FAULT(66)
30271 +       FAULT(66)
30272 +
30273 +#ifdef CONFIG_XEN
30274 +       /*
30275 +        * There is no particular reason for this code to be here, other than that
30276 +        * there happens to be space here that would go unused otherwise.  If this
30277 +        * fault ever gets "unreserved", simply move the following code to a more
30278 +        * suitable spot...
30279 +        */
30280 +
30281 +GLOBAL_ENTRY(xen_bsw1)
30282 +       /* FIXME: THIS CODE IS NOT NaT SAFE! */
30283 +       movl r30=XSI_BANKNUM;
30284 +       mov r31=1;;
30285 +       st4 [r30]=r31;
30286 +       movl r30=XSI_BANK1_R16;
30287 +       movl r31=XSI_BANK1_R16+8;;
30288 +       ld8 r16=[r30],16; ld8 r17=[r31],16;;
30289 +       ld8 r18=[r30],16; ld8 r19=[r31],16;;
30290 +       ld8 r20=[r30],16; ld8 r21=[r31],16;;
30291 +       ld8 r22=[r30],16; ld8 r23=[r31],16;;
30292 +       ld8 r24=[r30],16; ld8 r25=[r31],16;;
30293 +       ld8 r26=[r30],16; ld8 r27=[r31],16;;
30294 +       ld8 r28=[r30],16; ld8 r29=[r31],16;;
30295 +       ld8 r30=[r30]; ld8 r31=[r31];;
30296 +       br.ret.sptk.many b0
30297 +END(xen_bsw1)
30298 +#endif
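
An unprivileged guest cannot execute bsw.1, so xen_bsw1 rebuilds its effect by hand: it records the new bank number in the shared mapped-registers page (XSI_BANKNUM) and then reloads r16-r31 from the bank-1 save area the hypervisor keeps there (XSI_BANK1_R16); as the FIXME notes, NaT bits are lost in the copy. A rough C picture of the data movement only, with the layout name assumed from the XSI_* offsets above (register banks are of course not addressable from C):

        /* Illustrative only: shows the stores/loads xen_bsw1 performs against
         * the shared mapped-registers page.
         */
        struct xsi_bank1 {
                unsigned long r16_to_r31[16];          /* bank 1 copies kept by Xen */
        };

        static void xen_bsw1_sketch(volatile unsigned int *xsi_banknum,
                                    const struct xsi_bank1 *bank1,
                                    unsigned long dest[16])
        {
                *xsi_banknum = 1;                           /* st4 [XSI_BANKNUM]=1 */
                for (int i = 0; i < 16; i++)
                        dest[i] = bank1->r16_to_r31[i];     /* ld8 r16..r31 from XSI_BANK1_R16 */
        }
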
30299 +
30300 +       .org ia64_ivt+0x7f00
30301 +/////////////////////////////////////////////////////////////////////////////////////////
30302 +// 0x7f00 Entry 67 (size 16 bundles) Reserved
30303 +       DBG_FAULT(67)
30304 +       FAULT(67)
30305 +
30306 +#ifdef CONFIG_IA32_SUPPORT
30307 +
30308 +       /*
30309 +        * There is no particular reason for this code to be here, other than that
30310 +        * there happens to be space here that would go unused otherwise.  If this
30311 +        * fault ever gets "unreserved", simply move the following code to a more
30312 +        * suitable spot...
30313 +        */
30314 +
30315 +       // IA32 interrupt entry point
30316 +
30317 +ENTRY(dispatch_to_ia32_handler)
30318 +       SAVE_MIN
30319 +       ;;
30320 +       mov r14=cr.isr
30321 +       ssm psr.ic | PSR_DEFAULT_BITS
30322 +       ;;
30323 +       srlz.i                                  // guarantee that interruption collection is on
30324 +       ;;
30325 +(p15)  ssm psr.i
30326 +       adds r3=8,r2            // Base pointer for SAVE_REST
30327 +       ;;
30328 +       SAVE_REST
30329 +       ;;
30330 +       mov r15=0x80
30331 +       shr r14=r14,16          // Get interrupt number
30332 +       ;;
30333 +       cmp.ne p6,p0=r14,r15
30334 +(p6)   br.call.dpnt.many b6=non_ia32_syscall
30335 +
30336 +       adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
30337 +       adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
30338 +       ;;
30339 +       cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
30340 +       ld8 r8=[r14]            // get r8
30341 +       ;;
30342 +       st8 [r15]=r8            // save original EAX in r1 (IA32 procs don't use the GP)
30343 +       ;;
30344 +       alloc r15=ar.pfs,0,0,6,0        // must be first in an insn group
30345 +       ;;
30346 +       ld4 r8=[r14],8          // r8 == eax (syscall number)
30347 +       mov r15=IA32_NR_syscalls
30348 +       ;;
30349 +       cmp.ltu.unc p6,p7=r8,r15
30350 +       ld4 out1=[r14],8        // r9 == ecx
30351 +       ;;
30352 +       ld4 out2=[r14],8        // r10 == edx
30353 +       ;;
30354 +       ld4 out0=[r14]          // r11 == ebx
30355 +       adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
30356 +       ;;
30357 +       ld4 out5=[r14],PT(R14)-PT(R13)  // r13 == ebp
30358 +       ;;
30359 +       ld4 out3=[r14],PT(R15)-PT(R14)  // r14 == esi
30360 +       adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
30361 +       ;;
30362 +       ld4 out4=[r14]          // r15 == edi
30363 +       movl r16=ia32_syscall_table
30364 +       ;;
30365 +(p6)   shladd r16=r8,3,r16     // force ni_syscall if not valid syscall number
30366 +       ld4 r2=[r2]             // r2 = current_thread_info()->flags
30367 +       ;;
30368 +       ld8 r16=[r16]
30369 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2       // mask trace or audit
30370 +       ;;
30371 +       mov b6=r16
30372 +       movl r15=ia32_ret_from_syscall
30373 +       cmp.eq p8,p0=r2,r0
30374 +       ;;
30375 +       mov rp=r15
30376 +(p8)   br.call.sptk.many b6=b6
30377 +       br.cond.sptk ia32_trace_syscall
30378 +
30379 +non_ia32_syscall:
30380 +       alloc r15=ar.pfs,0,0,2,0
30381 +       mov out0=r14                            // interrupt #
30382 +       add out1=16,sp                          // pointer to pt_regs
30383 +       ;;                      // avoid WAW on CFM
30384 +       br.call.sptk.many rp=ia32_bad_interrupt
30385 +.ret1: movl r15=ia64_leave_kernel
30386 +       ;;
30387 +       mov rp=r15
30388 +       br.ret.sptk.many rp
30389 +END(dispatch_to_ia32_handler)
30390 +#endif /* CONFIG_IA32_SUPPORT */
30391 +
30392 +#ifdef CONFIG_XEN
30393 +       .section .text,"ax"
30394 +GLOBAL_ENTRY(xen_event_callback)
30395 +       mov r31=pr              // prepare to save predicates
30396 +       ;;
30397 +       SAVE_MIN_WITH_COVER     // uses r31; defines r2 and r3
30398 +       ;;
30399 +       movl r3=XSI_PSR_IC
30400 +       mov r14=1
30401 +       ;;
30402 +       st4 [r3]=r14
30403 +       ;;
30404 +       adds r3=8,r2            // set up second base pointer for SAVE_REST
30405 +       srlz.i                  // ensure everybody knows psr.ic is back on
30406 +       ;;
30407 +       SAVE_REST
30408 +       ;;
30409 +1:
30410 +       alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
30411 +       add out0=16,sp          // pass pointer to pt_regs as first arg
30412 +       ;;
30413 +       br.call.sptk.many b0=evtchn_do_upcall
30414 +       ;;
30415 +       movl r20=XSI_PSR_I_ADDR
30416 +       ;;
30417 +       ld8 r20=[r20]
30418 +       ;;
30419 +       adds r20=-1,r20         // vcpu_info->evtchn_upcall_pending
30420 +       ;;
30421 +       ld1 r20=[r20]
30422 +       ;;
30423 +       cmp.ne p6,p0=r20,r0     // if there are pending events, 
30424 +       (p6) br.spnt.few 1b     // call evtchn_do_upcall again.
30425 +       br.sptk.many ia64_leave_kernel   
30426 +END(xen_event_callback)
30427 +#endif
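
xen_event_callback is the guest's interrupt entry point under Xen: after saving state and turning interruption collection back on through the shared page, it loops calling evtchn_do_upcall until vcpu_info->evtchn_upcall_pending clears (the byte it probes sits one byte below the address stored at XSI_PSR_I_ADDR, per the adds r20=-1,r20 above). A rough C analogue, with the evtchn_do_upcall prototype assumed:

        /* Rough C analogue of the 1b: loop in xen_event_callback above. */
        static void xen_event_callback_sketch(struct pt_regs *regs,
                                              volatile unsigned char *upcall_pending)
        {
                do {
                        evtchn_do_upcall(regs);        /* dispatch pending event channels */
                } while (*upcall_pending);             /* events may arrive while dispatching */
                /* the assembly then branches to ia64_leave_kernel */
        }
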
30428 diff -ruNp linux-2.6.19/arch/ia64/xen/xenminstate.h linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenminstate.h
30429 --- linux-2.6.19/arch/ia64/xen/xenminstate.h    1970-01-01 00:00:00.000000000 +0000
30430 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenminstate.h  2007-02-02 19:10:21.000000000 +0000
30431 @@ -0,0 +1,368 @@
30432 +
30433 +#include <asm/cache.h>
30434 +
30435 +#ifdef CONFIG_XEN
30436 +#include "../kernel/entry.h"
30437 +#else
30438 +#include "entry.h"
30439 +#endif
30440 +
30441 +/*
30442 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
30443 + * on interrupts.
30444 + *
30445 + *  On entry:
30446 + *     r1:     pointer to current task (ar.k6)
30447 + */
30448 +#define MINSTATE_START_SAVE_MIN_VIRT                                                           \
30449 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
30450 +       ;;                                                                                      \
30451 +(pUStk)        mov.m r24=ar.rnat;                                                                      \
30452 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base of RBS */               \
30453 +(pKStk) mov r1=sp;                                     /* get sp  */                           \
30454 +       ;;                                                                                      \
30455 +(pUStk) lfetch.fault.excl.nt1 [r22];                                                           \
30456 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
30457 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
30458 +       ;;                                                                                      \
30459 +(pUStk)        mov ar.bspstore=r22;                            /* switch to kernel RBS */              \
30460 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, use sp (r12) */   \
30461 +       ;;                                                                                      \
30462 +(pUStk)        mov r18=ar.bsp;                                                                         \
30463 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
30464 +
30465 +#define MINSTATE_END_SAVE_MIN_VIRT                                                             \
30466 +       bsw.1;                  /* switch back to bank 1 (must be last in insn group) */        \
30467 +       ;;
30468 +
30469 +/*
30470 + * For mca_asm.S we want to access the stack physically since the state is saved before we
30471 + * go virtual and don't want to destroy the iip or ipsr.
30472 + */
30473 +#define MINSTATE_START_SAVE_MIN_PHYS                                                           \
30474 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                                         \
30475 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                                   \
30476 +(pKStk) ld8 r3 = [r3];;                                                                                \
30477 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                                            \
30478 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                                          \
30479 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
30480 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of register backing store */    \
30481 +       ;;                                                                                      \
30482 +(pUStk)        mov r24=ar.rnat;                                                                        \
30483 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
30484 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
30485 +(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel virtual addr of RBS */        \
30486 +       ;;                                                                                      \
30487 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp (r12) */           \
30488 +(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS */                      \
30489 +       ;;                                                                                      \
30490 +(pUStk)        mov r18=ar.bsp;                                                                         \
30491 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
30492 +
30493 +#define MINSTATE_END_SAVE_MIN_PHYS                                                             \
30494 +       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */                  \
30495 +       ;;
30496 +
30497 +#ifdef MINSTATE_VIRT
30498 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT)
30499 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
30500 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
30501 +#endif
30502 +
30503 +#ifdef MINSTATE_PHYS
30504 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
30505 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
30506 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
30507 +#endif
30508 +
30509 +/*
30510 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
30511 + * the minimum state necessary that allows us to turn psr.ic back
30512 + * on.
30513 + *
30514 + * Assumed state upon entry:
30515 + *     psr.ic: off
30516 + *     r31:    contains saved predicates (pr)
30517 + *
30518 + * Upon exit, the state is as follows:
30519 + *     psr.ic: off
30520 + *      r2 = points to &pt_regs.r16
30521 + *      r8 = contents of ar.ccv
30522 + *      r9 = contents of ar.csd
30523 + *     r10 = contents of ar.ssd
30524 + *     r11 = FPSR_DEFAULT
30525 + *     r12 = kernel sp (kernel virtual address)
30526 + *     r13 = points to current task_struct (kernel virtual address)
30527 + *     p15 = TRUE if psr.i is set in cr.ipsr
30528 + *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
30529 + *             preserved
30530 + * CONFIG_XEN note: p6/p7 are not preserved
30531 + *
30532 + * Note that psr.ic is NOT turned on by this macro.  This is so that
30533 + * we can pass interruption state as arguments to a handler.
30534 + */
30535 +#ifdef CONFIG_XEN
30536 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
30537 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
30538 +       mov r27=ar.rsc;                 /* M */                                                 \
30539 +       mov r20=r1;                     /* A */                                                 \
30540 +       mov r25=ar.unat;                /* M */                                                 \
30541 +       /* mov r29=cr.ipsr;             /* M */                                                 \
30542 +       movl r29=XSI_IPSR;;                                                                     \
30543 +       ld8 r29=[r29];;                                                                         \
30544 +       mov r26=ar.pfs;                 /* I */                                                 \
30545 +       /* mov r28=cr.iip;              /* M */                                                 \
30546 +       movl r28=XSI_IIP;;                                                                      \
30547 +       ld8 r28=[r28];;                                                                         \
30548 +       mov r21=ar.fpsr;                /* M */                                                 \
30549 +       COVER;                  /* B;; (or nothing) */                                  \
30550 +       ;;                                                                                      \
30551 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
30552 +       ;;                                                                                      \
30553 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
30554 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
30555 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
30556 +       /* switch from user to kernel RBS: */                                                   \
30557 +       ;;                                                                                      \
30558 +       invala;                         /* M */                                                 \
30559 +       /* SAVE_IFS; /* see xen special handling below */                                               \
30560 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
30561 +       ;;                                                                                      \
30562 +       MINSTATE_START_SAVE_MIN                                                                 \
30563 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
30564 +       adds r16=PT(CR_IPSR),r1;                                                                \
30565 +       ;;                                                                                      \
30566 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
30567 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
30568 +       ;;                                                                                      \
30569 +       lfetch.fault.excl.nt1 [r17];                                                            \
30570 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
30571 +       mov r29=b0                                                                              \
30572 +       ;;                                                                                      \
30573 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
30574 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
30575 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
30576 +       ;;                                                                                      \
30577 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
30578 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
30579 +        ;;                                                                                     \
30580 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
30581 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
30582 +        ;;                                                                                     \
30583 +       /* xen special handling for possibly lazy cover */                                      \
30584 +       movl r8=XSI_INCOMPL_REGFR;                                                              \
30585 +       ;;                                                                                      \
30586 +       ld4 r30=[r8];                                                                           \
30587 +       ;;                                                                                      \
30588 +       /* set XSI_INCOMPL_REGFR 0 */                                                           \
30589 +       st4 [r8]=r0;                                                                            \
30590 +       cmp.eq  p6,p7=r30,r0;                                                                   \
30591 +       ;; /* not sure if this stop bit is necessary */                                         \
30592 +(p6)   adds r8=XSI_PRECOVER_IFS-XSI_INCOMPL_REGFR,r8;                                          \
30593 +(p7)   adds r8=XSI_IFS-XSI_INCOMPL_REGFR,r8;                                                   \
30594 +       ;;                                                                                      \
30595 +       ld8 r30=[r8];                                                                           \
30596 +       ;;                                                                                      \
30597 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
30598 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
30599 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
30600 +       mov r8=ar.ccv;                                                                          \
30601 +       mov r9=ar.csd;                                                                          \
30602 +       mov r10=ar.ssd;                                                                         \
30603 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
30604 +       ;;                                                                                      \
30605 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
30606 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
30607 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
30608 +       ;;                                                                                      \
30609 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
30610 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
30611 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
30612 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
30613 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
30614 +       st8 [r17]=r31,16;       /* save predicates */                                           \
30615 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
30616 +       ;;                                                                                      \
30617 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
30618 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
30619 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
30620 +       ;;                                                                                      \
30621 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
30622 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
30623 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
30624 +       ;;                                                                                      \
30625 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
30626 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
30627 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
30628 +       ;;                                                                                      \
30629 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
30630 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
30631 +       ;;                                                                                      \
30632 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
30633 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
30634 +       ;;                                                                                      \
30635 +       EXTRA;                                                                                  \
30636 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;                                        \
30637 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
30638 +       ;;                                                                                      \
30639 +       movl r1=__gp;           /* establish kernel global pointer */                           \
30640 +       ;;                                                                                      \
30641 +       /* MINSTATE_END_SAVE_MIN */
30642 +#else
30643 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
30644 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
30645 +       mov r27=ar.rsc;                 /* M */                                                 \
30646 +       mov r20=r1;                     /* A */                                                 \
30647 +       mov r25=ar.unat;                /* M */                                                 \
30648 +       mov r29=cr.ipsr;                /* M */                                                 \
30649 +       mov r26=ar.pfs;                 /* I */                                                 \
30650 +       mov r28=cr.iip;                 /* M */                                                 \
30651 +       mov r21=ar.fpsr;                /* M */                                                 \
30652 +       COVER;                          /* B;; (or nothing) */                                  \
30653 +       ;;                                                                                      \
30654 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
30655 +       ;;                                                                                      \
30656 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
30657 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
30658 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
30659 +       /* switch from user to kernel RBS: */                                                   \
30660 +       ;;                                                                                      \
30661 +       invala;                         /* M */                                                 \
30662 +       SAVE_IFS;                                                                               \
30663 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
30664 +       ;;                                                                                      \
30665 +       MINSTATE_START_SAVE_MIN                                                                 \
30666 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
30667 +       adds r16=PT(CR_IPSR),r1;                                                                \
30668 +       ;;                                                                                      \
30669 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
30670 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
30671 +       ;;                                                                                      \
30672 +       lfetch.fault.excl.nt1 [r17];                                                            \
30673 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
30674 +       mov r29=b0                                                                              \
30675 +       ;;                                                                                      \
30676 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
30677 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
30678 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
30679 +       ;;                                                                                      \
30680 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
30681 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
30682 +        ;;                                                                                     \
30683 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
30684 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
30685 +        ;;                                                                                     \
30686 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
30687 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
30688 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
30689 +       mov r8=ar.ccv;                                                                          \
30690 +       mov r9=ar.csd;                                                                          \
30691 +       mov r10=ar.ssd;                                                                         \
30692 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
30693 +       ;;                                                                                      \
30694 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
30695 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
30696 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
30697 +       ;;                                                                                      \
30698 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
30699 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
30700 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
30701 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
30702 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
30703 +       st8 [r17]=r31,16;       /* save predicates */                                           \
30704 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
30705 +       ;;                                                                                      \
30706 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
30707 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
30708 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
30709 +       ;;                                                                                      \
30710 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
30711 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
30712 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
30713 +       ;;                                                                                      \
30714 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
30715 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
30716 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
30717 +       ;;                                                                                      \
30718 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
30719 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
30720 +       ;;                                                                                      \
30721 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
30722 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
30723 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
30724 +       ;;                                                                                      \
30725 +       EXTRA;                                                                                  \
30726 +       movl r1=__gp;           /* establish kernel global pointer */                           \
30727 +       ;;                                                                                      \
30728 +       MINSTATE_END_SAVE_MIN
30729 +#endif
30730 +
30731 +/*
30732 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
30733 + *
30734 + * Assumed state upon entry:
30735 + *     psr.ic: on
30736 + *     r2:     points to &pt_regs.r16
30737 + *     r3:     points to &pt_regs.r17
30738 + *     r8:     contents of ar.ccv
30739 + *     r9:     contents of ar.csd
30740 + *     r10:    contents of ar.ssd
30741 + *     r11:    FPSR_DEFAULT
30742 + *
30743 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
30744 + */
30745 +#define SAVE_REST                              \
30746 +.mem.offset 0,0; st8.spill [r2]=r16,16;                \
30747 +.mem.offset 8,0; st8.spill [r3]=r17,16;                \
30748 +       ;;                                      \
30749 +.mem.offset 0,0; st8.spill [r2]=r18,16;                \
30750 +.mem.offset 8,0; st8.spill [r3]=r19,16;                \
30751 +       ;;                                      \
30752 +.mem.offset 0,0; st8.spill [r2]=r20,16;                \
30753 +.mem.offset 8,0; st8.spill [r3]=r21,16;                \
30754 +       mov r18=b6;                             \
30755 +       ;;                                      \
30756 +.mem.offset 0,0; st8.spill [r2]=r22,16;                \
30757 +.mem.offset 8,0; st8.spill [r3]=r23,16;                \
30758 +       mov r19=b7;                             \
30759 +       ;;                                      \
30760 +.mem.offset 0,0; st8.spill [r2]=r24,16;                \
30761 +.mem.offset 8,0; st8.spill [r3]=r25,16;                \
30762 +       ;;                                      \
30763 +.mem.offset 0,0; st8.spill [r2]=r26,16;                \
30764 +.mem.offset 8,0; st8.spill [r3]=r27,16;                \
30765 +       ;;                                      \
30766 +.mem.offset 0,0; st8.spill [r2]=r28,16;                \
30767 +.mem.offset 8,0; st8.spill [r3]=r29,16;                \
30768 +       ;;                                      \
30769 +.mem.offset 0,0; st8.spill [r2]=r30,16;                \
30770 +.mem.offset 8,0; st8.spill [r3]=r31,32;                \
30771 +       ;;                                      \
30772 +       mov ar.fpsr=r11;        /* M-unit */    \
30773 +       st8 [r2]=r8,8;          /* ar.ccv */    \
30774 +       adds r24=PT(B6)-PT(F7),r3;              \
30775 +       ;;                                      \
30776 +       stf.spill [r2]=f6,32;                   \
30777 +       stf.spill [r3]=f7,32;                   \
30778 +       ;;                                      \
30779 +       stf.spill [r2]=f8,32;                   \
30780 +       stf.spill [r3]=f9,32;                   \
30781 +       ;;                                      \
30782 +       stf.spill [r2]=f10;                     \
30783 +       stf.spill [r3]=f11;                     \
30784 +       adds r25=PT(B7)-PT(F11),r3;             \
30785 +       ;;                                      \
30786 +       st8 [r24]=r18,16;       /* b6 */        \
30787 +       st8 [r25]=r19,16;       /* b7 */        \
30788 +       ;;                                      \
30789 +       st8 [r24]=r9;           /* ar.csd */    \
30790 +       st8 [r25]=r10;          /* ar.ssd */    \
30791 +       ;;
30792 +
30793 +#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
30794 +#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
30795 +#ifdef CONFIG_XEN
30796 +#define SAVE_MIN               break 0;; /* FIXME: non-cover version only for ia32 support? */
30797 +#else
30798 +#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
30799 +#endif
30800 diff -ruNp linux-2.6.19/arch/ia64/xen/xenpal.S linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenpal.S
30801 --- linux-2.6.19/arch/ia64/xen/xenpal.S 1970-01-01 00:00:00.000000000 +0000
30802 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xenpal.S       2007-02-02 19:10:21.000000000 +0000
30803 @@ -0,0 +1,76 @@
30804 +/*
30805 + * ia64/xen/xenpal.S
30806 + *
30807 + * Alternate PAL  routines for Xen.  Heavily leveraged from
30808 + *   ia64/kernel/pal.S
30809 + *
30810 + * Copyright (C) 2005 Hewlett-Packard Co
30811 + *     Dan Magenheimer <dan.magenheimer@.hp.com>
30812 + */
30813 +
30814 +#include <asm/asmmacro.h>
30815 +#include <asm/processor.h>
30816 +
30817 +GLOBAL_ENTRY(xen_pal_call_static)
30818 +       .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
30819 +       alloc loc1 = ar.pfs,5,5,0,0
30820 +#ifdef CONFIG_XEN
30821 +       movl r22=running_on_xen;;
30822 +       ld4 r22=[r22];;
30823 +       cmp.eq p7,p0=r22,r0
30824 +(p7)   br.cond.spnt.many __ia64_pal_call_static;;
30825 +#endif
30826 +       movl loc2 = pal_entry_point
30827 +1:     {
30828 +         mov r28 = in0
30829 +         mov r29 = in1
30830 +         mov r8 = ip
30831 +       }
30832 +       ;;
30833 +       ld8 loc2 = [loc2]               // loc2 <- entry point
30834 +       tbit.nz p6,p7 = in4, 0
30835 +       adds r8 = 1f-1b,r8
30836 +       mov loc4=ar.rsc                 // save RSE configuration
30837 +       ;;
30838 +       mov ar.rsc=0                    // put RSE in enforced lazy, LE mode
30839 +       mov loc3 = psr
30840 +       mov loc0 = rp
30841 +       .body
30842 +       mov r30 = in2
30843 +
30844 +#ifdef CONFIG_XEN
30845 +       // this is low priority for paravirtualization, but is called
30846 +       // from the idle loop so confuses privop counting
30847 +       movl r31=XSI_PSR_IC
30848 +       ;;
30849 +(p6)   st4 [r31]=r0
30850 +       ;;
30851 +(p7)   adds r31=XSI_PSR_I_ADDR_OFS-XSI_PSR_IC_OFS,r31
30852 +(p7)   mov r22=1
30853 +       ;;
30854 +(p7)   ld8 r31=[r31]
30855 +       ;;
30856 +(p7)   st1 [r31]=r22
30857 +       ;;
30858 +       mov r31 = in3
30859 +       mov b7 = loc2
30860 +       ;;
30861 +#else
30862 +(p6)   rsm psr.i | psr.ic
30863 +       mov r31 = in3
30864 +       mov b7 = loc2
30865 +
30866 +(p7)   rsm psr.i
30867 +       ;;
30868 +(p6)   srlz.i
30869 +#endif
30870 +       mov rp = r8
30871 +       br.cond.sptk.many b7
30872 +1:     mov psr.l = loc3
30873 +       mov ar.rsc = loc4               // restore RSE configuration
30874 +       mov ar.pfs = loc1
30875 +       mov rp = loc0
30876 +       ;;
30877 +       srlz.d                          // serialize restoration of psr.l
30878 +       br.ret.sptk.many b0
30879 +END(xen_pal_call_static)
30880 diff -ruNp linux-2.6.19/arch/ia64/xen/xensetup.S linux-2.6.19-xen-3.0.4/arch/ia64/xen/xensetup.S
30881 --- linux-2.6.19/arch/ia64/xen/xensetup.S       1970-01-01 00:00:00.000000000 +0000
30882 +++ linux-2.6.19-xen-3.0.4/arch/ia64/xen/xensetup.S     2007-02-02 19:10:21.000000000 +0000
30883 @@ -0,0 +1,53 @@
30884 +/*
30885 + * Support routines for Xen
30886 + *
30887 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
30888 + */
30889 +
30890 +#include <asm/processor.h>
30891 +#include <asm/asmmacro.h>
30892 +
30893 +#define isBP   p3      // are we the Bootstrap Processor?
30894 +
30895 +       .text
30896 +GLOBAL_ENTRY(early_xen_setup)
30897 +       mov r8=ar.rsc           // Initialized in head.S
30898 +(isBP) movl r9=running_on_xen;;
30899 +       extr.u r8=r8,2,2;;      // Extract pl fields
30900 +       cmp.eq p7,p0=r8,r0      // p7: !running on xen
30901 +       mov r8=1                // booleanize.
30902 +(p7)   br.ret.sptk.many rp;;
30903 +(isBP) st4 [r9]=r8
30904 +       movl r10=xen_ivt;;
30905 +       
30906 +       mov cr.iva=r10
30907 +
30908 +       /* Set xsi base.  */
30909 +#define FW_HYPERCALL_SET_SHARED_INFO_VA                        0x600
30910 +(isBP) mov r2=FW_HYPERCALL_SET_SHARED_INFO_VA
30911 +(isBP) movl r28=XSI_BASE;;
30912 +(isBP) break 0x1000;;
30913 +
30914 +       br.ret.sptk.many rp
30915 +       ;;
30916 +END(early_xen_setup)
30917 +
30918 +#include <xen/interface/xen.h>
30919 +
30920 +/* Stub for suspend.
30921 +   Just force the stacked registers to be written in memory.  */       
30922 +GLOBAL_ENTRY(xencomm_arch_hypercall_suspend)
30923 +       mov r15=r32
30924 +       ;; 
30925 +       alloc r20=ar.pfs,0,0,0,0
30926 +       mov r2=__HYPERVISOR_sched_op
30927 +       ;; 
30928 +       /* We don't want to deal with RSE.  */
30929 +       flushrs
30930 +       mov r14=2 // SCHEDOP_shutdown
30931 +       ;;
30932 +       break 0x1000
30933 +       ;; 
30934 +       mov ar.pfs=r20
30935 +       br.ret.sptk.many b0
30936 +END(xencomm_arch_hypercall_suspend)
30937 diff -ruNp linux-2.6.19/arch/um/kernel/physmem.c linux-2.6.19-xen-3.0.4/arch/um/kernel/physmem.c
30938 --- linux-2.6.19/arch/um/kernel/physmem.c       2006-11-29 21:57:37.000000000 +0000
30939 +++ linux-2.6.19-xen-3.0.4/arch/um/kernel/physmem.c     2007-02-02 19:10:26.000000000 +0000
30940 @@ -226,7 +226,7 @@ EXPORT_SYMBOL(physmem_forget_descriptor)
30941  EXPORT_SYMBOL(physmem_remove_mapping);
30942  EXPORT_SYMBOL(physmem_subst_mapping);
30943  
30944 -void arch_free_page(struct page *page, int order)
30945 +int arch_free_page(struct page *page, int order)
30946  {
30947         void *virt;
30948         int i;
30949 @@ -235,6 +235,8 @@ void arch_free_page(struct page *page, i
30950                 virt = __va(page_to_phys(page + i));
30951                 physmem_remove_mapping(virt);
30952         }
30953 +
30954 +       return 0;
30955  }
30956  
30957  int is_remapped(void *virt)
30958 diff -ruNp linux-2.6.19/arch/x86_64/Kconfig linux-2.6.19-xen-3.0.4/arch/x86_64/Kconfig
30959 --- linux-2.6.19/arch/x86_64/Kconfig    2006-11-29 21:57:37.000000000 +0000
30960 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/Kconfig  2007-02-02 19:10:26.000000000 +0000
30961 @@ -34,6 +34,7 @@ config LOCKDEP_SUPPORT
30962  
30963  config STACKTRACE_SUPPORT
30964         bool
30965 +       depends on !X86_64_XEN
30966         default y
30967  
30968  config SEMAPHORE_SLEEPERS
30969 @@ -143,6 +144,22 @@ config GENERIC_CPU
30970  
30971  endchoice
30972  
30973 +config X86_64_XEN
30974 +       bool "Enable Xen compatible kernel"
30975 +       select SWIOTLB
30976 +       help
30977 +         This option will compile a kernel compatible with the Xen hypervisor.
30978 +
30979 +config X86_NO_TSS
30980 +       bool
30981 +       depends on X86_64_XEN
30982 +       default y
30983 +
30984 +config X86_NO_IDT
30985 +       bool
30986 +       depends on X86_64_XEN
30987 +       default y
30988 +
30989  #
30990  # Define implied options from the CPU selection here
30991  #
30992 @@ -163,6 +180,7 @@ config X86_INTERNODE_CACHE_BYTES
30993  
30994  config X86_TSC
30995         bool
30996 +       depends on !X86_64_XEN
30997         default y
30998  
30999  config X86_GOOD_APIC
31000 @@ -211,7 +229,7 @@ config X86_CPUID
31001  
31002  config X86_HT
31003         bool
31004 -       depends on SMP && !MK8
31005 +       depends on SMP && !MK8 && !X86_64_XEN
31006         default y
31007  
31008  config MATH_EMULATION
31009 @@ -225,14 +243,22 @@ config EISA
31010  
31011  config X86_IO_APIC
31012         bool
31013 +       depends !XEN_UNPRIVILEGED_GUEST
31014         default y
31015  
31016 +config X86_XEN_GENAPIC
31017 +       bool
31018 +       depends X86_64_XEN
31019 +       default XEN_PRIVILEGED_GUEST || SMP
31020 +
31021  config X86_LOCAL_APIC
31022         bool
31023 +       depends !XEN_UNPRIVILEGED_GUEST
31024         default y
31025  
31026  config MTRR
31027         bool "MTRR (Memory Type Range Register) support"
31028 +       depends on !XEN_UNPRIVILEGED_GUEST
31029         ---help---
31030           On Intel P6 family processors (Pentium Pro, Pentium II and later)
31031           the Memory Type Range Registers (MTRRs) may be used to control
31032 @@ -273,7 +299,7 @@ config SMP
31033  
31034  config SCHED_SMT
31035         bool "SMT (Hyperthreading) scheduler support"
31036 -       depends on SMP
31037 +       depends on SMP && !X86_64_XEN
31038         default n
31039         help
31040           SMT scheduler support improves the CPU scheduler's decision making
31041 @@ -283,7 +309,7 @@ config SCHED_SMT
31042  
31043  config SCHED_MC
31044         bool "Multi-core scheduler support"
31045 -       depends on SMP
31046 +       depends on SMP && !X86_64_XEN
31047         default y
31048         help
31049           Multi-core scheduler support improves the CPU scheduler's decision
31050 @@ -294,7 +320,7 @@ source "kernel/Kconfig.preempt"
31051  
31052  config NUMA
31053         bool "Non Uniform Memory Access (NUMA) Support"
31054 -       depends on SMP
31055 +       depends on SMP && !X86_64_XEN
31056         help
31057          Enable NUMA (Non Uniform Memory Access) support. The kernel 
31058          will try to allocate memory used by a CPU on the local memory 
31059 @@ -355,7 +381,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
31060  
31061  config ARCH_SPARSEMEM_ENABLE
31062         def_bool y
31063 -       depends on (NUMA || EXPERIMENTAL)
31064 +       depends on (NUMA || EXPERIMENTAL) && !X86_64_XEN
31065  
31066  config ARCH_MEMORY_PROBE
31067         def_bool y
31068 @@ -383,6 +409,7 @@ config NR_CPUS
31069         int "Maximum number of CPUs (2-256)"
31070         range 2 255
31071         depends on SMP
31072 +       default "16" if X86_64_XEN
31073         default "8"
31074         help
31075           This allows you to specify the maximum number of CPUs which this
31076 @@ -405,6 +432,7 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
31077  
31078  config HPET_TIMER
31079         bool
31080 +       depends on !X86_64_XEN
31081         default y
31082         help
31083           Use the IA-PC HPET (High Precision Event Timer) to manage
31084 @@ -425,7 +453,7 @@ config IOMMU
31085         default y
31086         select SWIOTLB
31087         select AGP
31088 -       depends on PCI
31089 +       depends on PCI && !X86_64_XEN
31090         help
31091           Support for full DMA access of devices with 32bit memory access only
31092           on systems with more than 3GB. This is usually needed for USB,
31093 @@ -461,6 +489,7 @@ config SWIOTLB
31094  
31095  config X86_MCE
31096         bool "Machine check support" if EMBEDDED
31097 +       depends on !X86_64_XEN
31098         default y
31099         help
31100            Include a machine check error handler to report hardware errors.
31101 @@ -486,6 +515,7 @@ config X86_MCE_AMD
31102  
31103  config KEXEC
31104         bool "kexec system call"
31105 +       depends on !XEN_UNPRIVILEGED_GUEST
31106         help
31107           kexec is a system call that implements the ability to shutdown your
31108           current kernel, and to start another kernel.  It is like a reboot
31109 @@ -611,8 +641,11 @@ config GENERIC_PENDING_IRQ
31110         default y
31111  
31112  menu "Power management options"
31113 +       depends on !XEN_UNPRIVILEGED_GUEST
31114  
31115 +if !X86_64_XEN
31116  source kernel/power/Kconfig
31117 +endif
31118  
31119  source "drivers/acpi/Kconfig"
31120  
31121 @@ -635,6 +668,21 @@ config PCI_MMCONFIG
31122         bool "Support mmconfig PCI config space access"
31123         depends on PCI && ACPI
31124  
31125 +config XEN_PCIDEV_FRONTEND
31126 +       bool "Xen PCI Frontend"
31127 +       depends on PCI && X86_64_XEN
31128 +       default y
31129 +       help
31130 +         The PCI device frontend driver allows the kernel to import arbitrary
31131 +         PCI devices from a PCI backend to support PCI driver domains.
31132 +
31133 +config XEN_PCIDEV_FE_DEBUG
31134 +       bool "Xen PCI Frontend Debugging"
31135 +       depends on XEN_PCIDEV_FRONTEND
31136 +       default n
31137 +       help
31138 +         Enables some debug statements within the PCI Frontend.
31139 +
31140  source "drivers/pci/pcie/Kconfig"
31141  
31142  source "drivers/pci/Kconfig"
31143 @@ -705,4 +753,6 @@ source "security/Kconfig"
31144  
31145  source "crypto/Kconfig"
31146  
31147 +source "drivers/xen/Kconfig"
31148 +
31149  source "lib/Kconfig"
31150 diff -ruNp linux-2.6.19/arch/x86_64/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/Makefile
31151 --- linux-2.6.19/arch/x86_64/Makefile   2006-11-29 21:57:37.000000000 +0000
31152 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/Makefile 2007-02-02 19:10:26.000000000 +0000
31153 @@ -32,6 +32,10 @@ cflags-$(CONFIG_MK8) += $(call cc-option
31154  cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
31155  cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
31156  
31157 +cppflags-$(CONFIG_XEN) += \
31158 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
31159 +CPPFLAGS += $(cppflags-y)
31160 +
31161  cflags-y += -m64
31162  cflags-y += -mno-red-zone
31163  cflags-y += -mcmodel=kernel
31164 @@ -88,6 +92,21 @@ boot := arch/x86_64/boot
31165  PHONY += bzImage bzlilo install archmrproper \
31166          fdimage fdimage144 fdimage288 isoimage archclean
31167  
31168 +ifdef CONFIG_XEN
31169 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
31170 +head-y := arch/x86_64/kernel/head-xen.o arch/x86_64/kernel/head64-xen.o arch/x86_64/kernel/init_task.o
31171 +LDFLAGS_vmlinux := -e _start
31172 +boot := arch/i386/boot-xen
31173 +.PHONY: vmlinuz
31174 +#Default target when executing "make"
31175 +all: vmlinuz
31176 +
31177 +vmlinuz: vmlinux
31178 +       $(Q)$(MAKE) $(build)=$(boot) $@
31179 +
31180 +install:
31181 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
31182 +else
31183  #Default target when executing "make"
31184  all: bzImage
31185  
31186 @@ -108,6 +127,7 @@ fdimage fdimage144 fdimage288 isoimage: 
31187  
31188  install:
31189         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ 
31190 +endif
31191  
31192  archclean:
31193         $(Q)$(MAKE) $(clean)=$(boot)
31194 diff -ruNp linux-2.6.19/arch/x86_64/ia32/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/Makefile
31195 --- linux-2.6.19/arch/x86_64/ia32/Makefile      2006-11-29 21:57:37.000000000 +0000
31196 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/Makefile    2007-02-02 19:10:26.000000000 +0000
31197 @@ -27,9 +27,25 @@ quiet_cmd_syscall = SYSCALL $@
31198                            -Wl,-soname=linux-gate.so.1 -o $@ \
31199                            -Wl,-T,$(filter-out FORCE,$^)
31200  
31201 +$(obj)/vsyscall-int80.so \
31202  $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
31203  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
31204         $(call if_changed,syscall)
31205  
31206 -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
31207 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
31208 +AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -Iarch/i386/kernel
31209 +AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 -Iarch/i386/kernel
31210 +
31211 +ifdef CONFIG_XEN
31212 +AFLAGS_vsyscall-int80.o = -m32 -Wa,-32 -Iarch/i386/kernel
31213 +CFLAGS_syscall32-xen.o += -DUSE_INT80
31214 +AFLAGS_syscall32_syscall-xen.o += -DUSE_INT80
31215 +
31216 +$(obj)/syscall32_syscall-xen.o: \
31217 +       $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so)
31218 +
31219 +targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
31220 +
31221 +include $(srctree)/scripts/Makefile.xen
31222 +
31223 +obj-y := $(call cherrypickxen, $(obj-y))
31224 +endif
31225 diff -ruNp linux-2.6.19/arch/x86_64/ia32/ia32entry-xen.S linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/ia32entry-xen.S
31226 --- linux-2.6.19/arch/x86_64/ia32/ia32entry-xen.S       1970-01-01 00:00:00.000000000 +0000
31227 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/ia32entry-xen.S     2007-02-02 19:10:26.000000000 +0000
31228 @@ -0,0 +1,743 @@
31229 +/*
31230 + * Compatibility mode system call entry point for x86-64. 
31231 + *             
31232 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
31233 + */             
31234 +
31235 +#include <asm/dwarf2.h>
31236 +#include <asm/calling.h>
31237 +#include <asm/asm-offsets.h>
31238 +#include <asm/current.h>
31239 +#include <asm/errno.h>
31240 +#include <asm/ia32_unistd.h>   
31241 +#include <asm/thread_info.h>   
31242 +#include <asm/segment.h>
31243 +#include <asm/vsyscall32.h>
31244 +#include <asm/irqflags.h>
31245 +#include <linux/linkage.h>
31246 +
31247 +#define __XEN_X86_64 1
31248 +
31249 +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
31250 +
31251 +       .macro IA32_ARG_FIXUP noebp=0
31252 +       movl    %edi,%r8d
31253 +       .if \noebp
31254 +       .else
31255 +       movl    %ebp,%r9d
31256 +       .endif
31257 +       xchg    %ecx,%esi
31258 +       movl    %ebx,%edi
31259 +       movl    %edx,%edx       /* zero extension */
31260 +       .endm 
31261 +
31262 +       /* clobbers %eax */     
31263 +       .macro  CLEAR_RREGS
31264 +       xorl    %eax,%eax
31265 +       movq    %rax,R11(%rsp)
31266 +       movq    %rax,R10(%rsp)
31267 +       movq    %rax,R9(%rsp)
31268 +       movq    %rax,R8(%rsp)
31269 +       .endm
31270 +
31271 +#if defined (__XEN_X86_64)
31272 +#include "../kernel/xen_entry.S"
31273 +               
31274 +#define        __swapgs
31275 +#define __cli
31276 +#define __sti  
31277 +#else
31278 +/*
31279 + * Use the native instructions
31280 + */    
31281 +#define        __swapgs        swapgs
31282 +#define __cli          cli
31283 +#define __sti          sti     
31284 +#endif                 
31285 +
31286 +       .macro CFI_STARTPROC32 simple
31287 +       CFI_STARTPROC   \simple
31288 +       CFI_UNDEFINED   r8
31289 +       CFI_UNDEFINED   r9
31290 +       CFI_UNDEFINED   r10
31291 +       CFI_UNDEFINED   r11
31292 +       CFI_UNDEFINED   r12
31293 +       CFI_UNDEFINED   r13
31294 +       CFI_UNDEFINED   r14
31295 +       CFI_UNDEFINED   r15
31296 +       .endm
31297 +
31298 +/*
31299 + * 32bit SYSENTER instruction entry.
31300 + *
31301 + * Arguments:
31302 + * %eax        System call number.
31303 + * %ebx Arg1
31304 + * %ecx Arg2
31305 + * %edx Arg3
31306 + * %esi Arg4
31307 + * %edi Arg5
31308 + * %ebp user stack
31309 + * 0(%ebp) Arg6        
31310 + *     
31311 + * Interrupts off.
31312 + *     
31313 + * This is purely a fast path. For anything complicated we use the int 0x80
31314 + * path below. Set up a complete hardware stack frame to share code
31315 + * with the int 0x80 path.
31316 + */    
31317 +ENTRY(ia32_sysenter_target)
31318 +       CFI_STARTPROC32 simple
31319 +       CFI_DEF_CFA     rsp,0
31320 +       CFI_REGISTER    rsp,rbp
31321 +       __swapgs 
31322 +       movq    %gs:pda_kernelstack, %rsp
31323 +       addq    $(PDA_STACKOFFSET),%rsp 
31324 +       /*
31325 +        * No need to follow this irqs on/off section: the syscall
31326 +        * disabled irqs, here we enable it straight after entry:
31327 +        */
31328 +       XEN_UNBLOCK_EVENTS(%r11)        
31329 +       __sti
31330 +       movl    %ebp,%ebp               /* zero extension */
31331 +       pushq   $__USER32_DS
31332 +       CFI_ADJUST_CFA_OFFSET 8
31333 +       /*CFI_REL_OFFSET ss,0*/
31334 +       pushq   %rbp
31335 +       CFI_ADJUST_CFA_OFFSET 8
31336 +       CFI_REL_OFFSET rsp,0
31337 +       pushfq
31338 +       CFI_ADJUST_CFA_OFFSET 8
31339 +       /*CFI_REL_OFFSET rflags,0*/
31340 +       movl    $VSYSCALL32_SYSEXIT, %r10d
31341 +       CFI_REGISTER rip,r10
31342 +       pushq   $__USER32_CS
31343 +       CFI_ADJUST_CFA_OFFSET 8
31344 +       /*CFI_REL_OFFSET cs,0*/
31345 +       movl    %eax, %eax
31346 +       pushq   %r10
31347 +       CFI_ADJUST_CFA_OFFSET 8
31348 +       CFI_REL_OFFSET rip,0
31349 +       pushq   %rax
31350 +       CFI_ADJUST_CFA_OFFSET 8
31351 +       cld
31352 +       SAVE_ARGS 0,0,0
31353 +       /* no need to do an access_ok check here because rbp has been
31354 +          32bit zero extended */ 
31355 +1:     movl    (%rbp),%r9d
31356 +       .section __ex_table,"a"
31357 +       .quad 1b,ia32_badarg
31358 +       .previous       
31359 +       GET_THREAD_INFO(%r10)
31360 +       orl    $TS_COMPAT,threadinfo_status(%r10)
31361 +       testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31362 +       CFI_REMEMBER_STATE
31363 +       jnz  sysenter_tracesys
31364 +sysenter_do_call:      
31365 +       cmpl    $(IA32_NR_syscalls-1),%eax
31366 +       ja      ia32_badsys
31367 +       IA32_ARG_FIXUP 1
31368 +       call    *ia32_sys_call_table(,%rax,8)
31369 +       movq    %rax,RAX-ARGOFFSET(%rsp)
31370 +       GET_THREAD_INFO(%r10)
31371 +       XEN_BLOCK_EVENTS(%r11)  
31372 +       __cli
31373 +       TRACE_IRQS_OFF
31374 +       testl   $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
31375 +       jnz     int_ret_from_sys_call
31376 +       andl    $~TS_COMPAT,threadinfo_status(%r10)
31377 +       /* clear IF, that popfq doesn't enable interrupts early */
31378 +       andl  $~0x200,EFLAGS-R11(%rsp) 
31379 +       RESTORE_ARGS 1,24,1,1,1,1
31380 +       popfq
31381 +       CFI_ADJUST_CFA_OFFSET -8
31382 +       /*CFI_RESTORE rflags*/
31383 +       popq    %rcx                            /* User %esp */
31384 +       CFI_ADJUST_CFA_OFFSET -8
31385 +       CFI_REGISTER rsp,rcx
31386 +       movl    $VSYSCALL32_SYSEXIT,%edx        /* User %eip */
31387 +       CFI_REGISTER rip,rdx
31388 +       TRACE_IRQS_ON
31389 +       __swapgs
31390 +       XEN_UNBLOCK_EVENTS(%r11)                
31391 +       __sti           /* sti only takes effect after the next instruction */
31392 +       /* sysexit */
31393 +       .byte   0xf, 0x35
31394 +
31395 +sysenter_tracesys:
31396 +       CFI_RESTORE_STATE
31397 +       SAVE_REST
31398 +       CLEAR_RREGS
31399 +       movq    $-ENOSYS,RAX(%rsp)      /* really needed? */
31400 +       movq    %rsp,%rdi        /* &pt_regs -> arg1 */
31401 +       call    syscall_trace_enter
31402 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
31403 +       RESTORE_REST
31404 +       movl    %ebp, %ebp
31405 +       /* no need to do an access_ok check here because rbp has been
31406 +          32bit zero extended */ 
31407 +1:     movl    (%rbp),%r9d
31408 +       .section __ex_table,"a"
31409 +       .quad 1b,ia32_badarg
31410 +       .previous
31411 +       jmp     sysenter_do_call
31412 +       CFI_ENDPROC
31413 +ENDPROC(ia32_sysenter_target)
31414 +
31415 +/*
31416 + * 32bit SYSCALL instruction entry.
31417 + *
31418 + * Arguments:
31419 + * %eax        System call number.
31420 + * %ebx Arg1
31421 + * %ecx return EIP 
31422 + * %edx Arg3
31423 + * %esi Arg4
31424 + * %edi Arg5
31425 + * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
31426 + * %esp user stack 
31427 + * 0(%esp) Arg6
31428 + *     
31429 + * Interrupts off.
31430 + *     
31431 + * This is purely a fast path. For anything complicated we use the int 0x80
31432 + * path below. Set up a complete hardware stack frame to share code
31433 + * with the int 0x80 path.     
31434 + */    
31435 +ENTRY(ia32_cstar_target)
31436 +       CFI_STARTPROC32 simple
31437 +       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
31438 +       CFI_REGISTER    rip,rcx
31439 +       /*CFI_REGISTER  rflags,r11*/
31440 +       __swapgs
31441 +       movl    %esp,%r8d
31442 +       CFI_REGISTER    rsp,r8
31443 +       movq    %gs:pda_kernelstack,%rsp
31444 +       /*
31445 +        * No need to follow this irqs on/off section: the syscall
31446 +        * disabled irqs and here we enable it straight after entry:
31447 +        */
31448 +       XEN_UNBLOCK_EVENTS(%r11)        
31449 +       __sti
31450 +       SAVE_ARGS 8,1,1
31451 +       movl    %eax,%eax       /* zero extension */
31452 +       movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
31453 +       movq    %rcx,RIP-ARGOFFSET(%rsp)
31454 +       CFI_REL_OFFSET rip,RIP-ARGOFFSET
31455 +       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
31456 +       movl    %ebp,%ecx
31457 +       movq    $__USER32_CS,CS-ARGOFFSET(%rsp)
31458 +       movq    $__USER32_DS,SS-ARGOFFSET(%rsp)
31459 +       movq    %r11,EFLAGS-ARGOFFSET(%rsp)
31460 +       /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
31461 +       movq    %r8,RSP-ARGOFFSET(%rsp) 
31462 +       CFI_REL_OFFSET rsp,RSP-ARGOFFSET
31463 +       /* no need to do an access_ok check here because r8 has been
31464 +          32bit zero extended */ 
31465 +       /* hardware stack frame is complete now */      
31466 +1:     movl    (%r8),%r9d
31467 +       .section __ex_table,"a"
31468 +       .quad 1b,ia32_badarg
31469 +       .previous       
31470 +       GET_THREAD_INFO(%r10)
31471 +       orl   $TS_COMPAT,threadinfo_status(%r10)
31472 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31473 +       CFI_REMEMBER_STATE
31474 +       jnz   cstar_tracesys
31475 +cstar_do_call: 
31476 +       cmpl $IA32_NR_syscalls-1,%eax
31477 +       ja  ia32_badsys
31478 +       IA32_ARG_FIXUP 1
31479 +       call *ia32_sys_call_table(,%rax,8)
31480 +       movq %rax,RAX-ARGOFFSET(%rsp)
31481 +       GET_THREAD_INFO(%r10)
31482 +       XEN_BLOCK_EVENTS(%r11)          
31483 +       __cli
31484 +       TRACE_IRQS_OFF
31485 +       testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
31486 +       jnz  int_ret_from_sys_call
31487 +       andl $~TS_COMPAT,threadinfo_status(%r10)
31488 +       RESTORE_ARGS 1,-ARG_SKIP,1,1,1
31489 +       movl RIP-ARGOFFSET(%rsp),%ecx
31490 +       CFI_REGISTER rip,rcx
31491 +       movl EFLAGS-ARGOFFSET(%rsp),%r11d       
31492 +       /*CFI_REGISTER rflags,r11*/
31493 +       TRACE_IRQS_ON
31494 +       movl RSP-ARGOFFSET(%rsp),%esp
31495 +       CFI_RESTORE rsp
31496 +       __swapgs
31497 +       sysretl
31498 +       
31499 +cstar_tracesys:        
31500 +       CFI_RESTORE_STATE
31501 +       SAVE_REST
31502 +       CLEAR_RREGS
31503 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
31504 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
31505 +       call syscall_trace_enter
31506 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
31507 +       RESTORE_REST
31508 +       movl RSP-ARGOFFSET(%rsp), %r8d
31509 +       /* no need to do an access_ok check here because r8 has been
31510 +          32bit zero extended */ 
31511 +1:     movl    (%r8),%r9d
31512 +       .section __ex_table,"a"
31513 +       .quad 1b,ia32_badarg
31514 +       .previous
31515 +       jmp cstar_do_call
31516 +END(ia32_cstar_target)
31517 +                               
31518 +ia32_badarg:
31519 +       movq $-EFAULT,%rax
31520 +       jmp ia32_sysret
31521 +       CFI_ENDPROC
31522 +
31523 +/* 
31524 + * Emulated IA32 system calls via int 0x80. 
31525 + *
31526 + * Arguments:   
31527 + * %eax        System call number.
31528 + * %ebx Arg1
31529 + * %ecx Arg2
31530 + * %edx Arg3
31531 + * %esi Arg4
31532 + * %edi Arg5
31533 + * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
31534 + *
31535 + * Notes:
31536 + * Uses the same stack frame as the x86-64 version.    
31537 + * All registers except %eax must be saved (but ptrace may violate that)
31538 + * Arguments are zero extended. For system calls that want sign extension and
31539 + * take long arguments a wrapper is needed. Most calls can just be called
31540 + * directly.
31541 + * Assumes it is only called from user space and entered with interrupts off.  
31542 + */                            
31543 +
31544 +ENTRY(ia32_syscall)
31545 +       CFI_STARTPROC   simple
31546 +       CFI_DEF_CFA     rsp,SS+8-RIP
31547 +       /*CFI_REL_OFFSET        ss,SS-RIP*/
31548 +       CFI_REL_OFFSET  rsp,RSP-RIP
31549 +       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
31550 +       /*CFI_REL_OFFSET        cs,CS-RIP*/
31551 +       CFI_REL_OFFSET  rip,RIP-RIP
31552 +       __swapgs
31553 +       /*
31554 +        * No need to follow this irqs on/off section: the syscall
31555 +        * entry disabled irqs and here we enable them straight after entry:
31556 +        */
31557 +       XEN_UNBLOCK_EVENTS(%r11)
31558 +       __sti
31559 +       movq (%rsp),%rcx
31560 +       movq 8(%rsp),%r11
31561 +        addq $0x10,%rsp /* skip rcx and r11 */
31562 +       movl %eax,%eax
31563 +       pushq %rax
31564 +       CFI_ADJUST_CFA_OFFSET 8
31565 +       cld
31566 +       /* note the registers are not zero extended to the stack frame.
31567 +          this could be a problem. */
31568 +       SAVE_ARGS 0,0,1
31569 +       GET_THREAD_INFO(%r10)
31570 +       orl   $TS_COMPAT,threadinfo_status(%r10)
31571 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31572 +       jnz ia32_tracesys
31573 +ia32_do_syscall:       
31574 +       cmpl $(IA32_NR_syscalls-1),%eax
31575 +       ja  ia32_badsys
31576 +       IA32_ARG_FIXUP
31577 +       call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
31578 +ia32_sysret:
31579 +       movq %rax,RAX-ARGOFFSET(%rsp)
31580 +       jmp int_ret_from_sys_call 
31581 +
31582 +ia32_tracesys:                  
31583 +       SAVE_REST
31584 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
31585 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
31586 +       call syscall_trace_enter
31587 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
31588 +       RESTORE_REST
31589 +       jmp ia32_do_syscall
31590 +END(ia32_syscall)
31591 +
31592 +ia32_badsys:
31593 +       movq $0,ORIG_RAX-ARGOFFSET(%rsp)
31594 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
31595 +       jmp int_ret_from_sys_call
31596 +
31597 +quiet_ni_syscall:
31598 +       movq $-ENOSYS,%rax
31599 +       ret
31600 +       CFI_ENDPROC
31601 +       
31602 +       .macro PTREGSCALL label, func, arg
31603 +       .globl \label
31604 +\label:
31605 +       leaq \func(%rip),%rax
31606 +       leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
31607 +       jmp  ia32_ptregs_common 
31608 +       .endm
31609 +
31610 +       CFI_STARTPROC32
31611 +
31612 +       PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
31613 +       PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
31614 +       PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
31615 +       PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
31616 +       PTREGSCALL stub32_execve, sys32_execve, %rcx
31617 +       PTREGSCALL stub32_fork, sys_fork, %rdi
31618 +       PTREGSCALL stub32_clone, sys32_clone, %rdx
31619 +       PTREGSCALL stub32_vfork, sys_vfork, %rdi
31620 +       PTREGSCALL stub32_iopl, sys_iopl, %rsi
31621 +       PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
31622 +
31623 +ENTRY(ia32_ptregs_common)
31624 +       popq %r11
31625 +       CFI_ENDPROC
31626 +       CFI_STARTPROC32 simple
31627 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
31628 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
31629 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
31630 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
31631 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
31632 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
31633 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
31634 +/*     CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
31635 +/*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
31636 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
31637 +/*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
31638 +       SAVE_REST
31639 +       call *%rax
31640 +       RESTORE_REST
31641 +       jmp  ia32_sysret        /* misbalances the return cache */
31642 +       CFI_ENDPROC
31643 +END(ia32_ptregs_common)
31644 +
31645 +       .section .rodata,"a"
31646 +       .align 8
31647 +ia32_sys_call_table:
31648 +       .quad sys_restart_syscall
31649 +       .quad sys_exit
31650 +       .quad stub32_fork
31651 +       .quad sys_read
31652 +       .quad sys_write
31653 +       .quad compat_sys_open           /* 5 */
31654 +       .quad sys_close
31655 +       .quad sys32_waitpid
31656 +       .quad sys_creat
31657 +       .quad sys_link
31658 +       .quad sys_unlink                /* 10 */
31659 +       .quad stub32_execve
31660 +       .quad sys_chdir
31661 +       .quad compat_sys_time
31662 +       .quad sys_mknod
31663 +       .quad sys_chmod         /* 15 */
31664 +       .quad sys_lchown16
31665 +       .quad quiet_ni_syscall                  /* old break syscall holder */
31666 +       .quad sys_stat
31667 +       .quad sys32_lseek
31668 +       .quad sys_getpid                /* 20 */
31669 +       .quad compat_sys_mount  /* mount  */
31670 +       .quad sys_oldumount     /* old_umount  */
31671 +       .quad sys_setuid16
31672 +       .quad sys_getuid16
31673 +       .quad compat_sys_stime  /* stime */             /* 25 */
31674 +       .quad sys32_ptrace      /* ptrace */
31675 +       .quad sys_alarm
31676 +       .quad sys_fstat /* (old)fstat */
31677 +       .quad sys_pause
31678 +       .quad compat_sys_utime  /* 30 */
31679 +       .quad quiet_ni_syscall  /* old stty syscall holder */
31680 +       .quad quiet_ni_syscall  /* old gtty syscall holder */
31681 +       .quad sys_access
31682 +       .quad sys_nice  
31683 +       .quad quiet_ni_syscall  /* 35 */        /* old ftime syscall holder */
31684 +       .quad sys_sync
31685 +       .quad sys32_kill
31686 +       .quad sys_rename
31687 +       .quad sys_mkdir
31688 +       .quad sys_rmdir         /* 40 */
31689 +       .quad sys_dup
31690 +       .quad sys32_pipe
31691 +       .quad compat_sys_times
31692 +       .quad quiet_ni_syscall                  /* old prof syscall holder */
31693 +       .quad sys_brk           /* 45 */
31694 +       .quad sys_setgid16
31695 +       .quad sys_getgid16
31696 +       .quad sys_signal
31697 +       .quad sys_geteuid16
31698 +       .quad sys_getegid16     /* 50 */
31699 +       .quad sys_acct
31700 +       .quad sys_umount                        /* new_umount */
31701 +       .quad quiet_ni_syscall                  /* old lock syscall holder */
31702 +       .quad compat_sys_ioctl
31703 +       .quad compat_sys_fcntl64                /* 55 */
31704 +       .quad quiet_ni_syscall                  /* old mpx syscall holder */
31705 +       .quad sys_setpgid
31706 +       .quad quiet_ni_syscall                  /* old ulimit syscall holder */
31707 +       .quad sys32_olduname
31708 +       .quad sys_umask         /* 60 */
31709 +       .quad sys_chroot
31710 +       .quad sys32_ustat
31711 +       .quad sys_dup2
31712 +       .quad sys_getppid
31713 +       .quad sys_getpgrp               /* 65 */
31714 +       .quad sys_setsid
31715 +       .quad sys32_sigaction
31716 +       .quad sys_sgetmask
31717 +       .quad sys_ssetmask
31718 +       .quad sys_setreuid16    /* 70 */
31719 +       .quad sys_setregid16
31720 +       .quad stub32_sigsuspend
31721 +       .quad compat_sys_sigpending
31722 +       .quad sys_sethostname
31723 +       .quad compat_sys_setrlimit      /* 75 */
31724 +       .quad compat_sys_old_getrlimit  /* old_getrlimit */
31725 +       .quad compat_sys_getrusage
31726 +       .quad sys32_gettimeofday
31727 +       .quad sys32_settimeofday
31728 +       .quad sys_getgroups16   /* 80 */
31729 +       .quad sys_setgroups16
31730 +       .quad sys32_old_select
31731 +       .quad sys_symlink
31732 +       .quad sys_lstat
31733 +       .quad sys_readlink              /* 85 */
31734 +#ifdef CONFIG_IA32_AOUT
31735 +       .quad sys_uselib
31736 +#else
31737 +       .quad quiet_ni_syscall
31738 +#endif
31739 +       .quad sys_swapon
31740 +       .quad sys_reboot
31741 +       .quad compat_sys_old_readdir
31742 +       .quad sys32_mmap                /* 90 */
31743 +       .quad sys_munmap
31744 +       .quad sys_truncate
31745 +       .quad sys_ftruncate
31746 +       .quad sys_fchmod
31747 +       .quad sys_fchown16              /* 95 */
31748 +       .quad sys_getpriority
31749 +       .quad sys_setpriority
31750 +       .quad quiet_ni_syscall                  /* old profil syscall holder */
31751 +       .quad compat_sys_statfs
31752 +       .quad compat_sys_fstatfs                /* 100 */
31753 +       .quad sys_ioperm
31754 +       .quad compat_sys_socketcall
31755 +       .quad sys_syslog
31756 +       .quad compat_sys_setitimer
31757 +       .quad compat_sys_getitimer      /* 105 */
31758 +       .quad compat_sys_newstat
31759 +       .quad compat_sys_newlstat
31760 +       .quad compat_sys_newfstat
31761 +       .quad sys32_uname
31762 +       .quad stub32_iopl               /* 110 */
31763 +       .quad sys_vhangup
31764 +       .quad quiet_ni_syscall  /* old "idle" system call */
31765 +       .quad sys32_vm86_warning        /* vm86old */ 
31766 +       .quad compat_sys_wait4
31767 +       .quad sys_swapoff               /* 115 */
31768 +       .quad sys32_sysinfo
31769 +       .quad sys32_ipc
31770 +       .quad sys_fsync
31771 +       .quad stub32_sigreturn
31772 +       .quad stub32_clone              /* 120 */
31773 +       .quad sys_setdomainname
31774 +       .quad sys_uname
31775 +       .quad sys_modify_ldt
31776 +       .quad compat_sys_adjtimex
31777 +       .quad sys32_mprotect            /* 125 */
31778 +       .quad compat_sys_sigprocmask
31779 +       .quad quiet_ni_syscall          /* create_module */
31780 +       .quad sys_init_module
31781 +       .quad sys_delete_module
31782 +       .quad quiet_ni_syscall          /* 130  get_kernel_syms */
31783 +       .quad sys_quotactl
31784 +       .quad sys_getpgid
31785 +       .quad sys_fchdir
31786 +       .quad quiet_ni_syscall  /* bdflush */
31787 +       .quad sys_sysfs         /* 135 */
31788 +       .quad sys_personality
31789 +       .quad quiet_ni_syscall  /* for afs_syscall */
31790 +       .quad sys_setfsuid16
31791 +       .quad sys_setfsgid16
31792 +       .quad sys_llseek                /* 140 */
31793 +       .quad compat_sys_getdents
31794 +       .quad compat_sys_select
31795 +       .quad sys_flock
31796 +       .quad sys_msync
31797 +       .quad compat_sys_readv          /* 145 */
31798 +       .quad compat_sys_writev
31799 +       .quad sys_getsid
31800 +       .quad sys_fdatasync
31801 +       .quad sys32_sysctl      /* sysctl */
31802 +       .quad sys_mlock         /* 150 */
31803 +       .quad sys_munlock
31804 +       .quad sys_mlockall
31805 +       .quad sys_munlockall
31806 +       .quad sys_sched_setparam
31807 +       .quad sys_sched_getparam   /* 155 */
31808 +       .quad sys_sched_setscheduler
31809 +       .quad sys_sched_getscheduler
31810 +       .quad sys_sched_yield
31811 +       .quad sys_sched_get_priority_max
31812 +       .quad sys_sched_get_priority_min  /* 160 */
31813 +       .quad sys_sched_rr_get_interval
31814 +       .quad compat_sys_nanosleep
31815 +       .quad sys_mremap
31816 +       .quad sys_setresuid16
31817 +       .quad sys_getresuid16   /* 165 */
31818 +       .quad sys32_vm86_warning        /* vm86 */ 
31819 +       .quad quiet_ni_syscall  /* query_module */
31820 +       .quad sys_poll
31821 +       .quad compat_sys_nfsservctl
31822 +       .quad sys_setresgid16   /* 170 */
31823 +       .quad sys_getresgid16
31824 +       .quad sys_prctl
31825 +       .quad stub32_rt_sigreturn
31826 +       .quad sys32_rt_sigaction
31827 +       .quad sys32_rt_sigprocmask      /* 175 */
31828 +       .quad sys32_rt_sigpending
31829 +       .quad compat_sys_rt_sigtimedwait
31830 +       .quad sys32_rt_sigqueueinfo
31831 +       .quad stub32_rt_sigsuspend
31832 +       .quad sys32_pread               /* 180 */
31833 +       .quad sys32_pwrite
31834 +       .quad sys_chown16
31835 +       .quad sys_getcwd
31836 +       .quad sys_capget
31837 +       .quad sys_capset
31838 +       .quad stub32_sigaltstack
31839 +       .quad sys32_sendfile
31840 +       .quad quiet_ni_syscall          /* streams1 */
31841 +       .quad quiet_ni_syscall          /* streams2 */
31842 +       .quad stub32_vfork            /* 190 */
31843 +       .quad compat_sys_getrlimit
31844 +       .quad sys32_mmap2
31845 +       .quad sys32_truncate64
31846 +       .quad sys32_ftruncate64
31847 +       .quad sys32_stat64              /* 195 */
31848 +       .quad sys32_lstat64
31849 +       .quad sys32_fstat64
31850 +       .quad sys_lchown
31851 +       .quad sys_getuid
31852 +       .quad sys_getgid                /* 200 */
31853 +       .quad sys_geteuid
31854 +       .quad sys_getegid
31855 +       .quad sys_setreuid
31856 +       .quad sys_setregid
31857 +       .quad sys_getgroups     /* 205 */
31858 +       .quad sys_setgroups
31859 +       .quad sys_fchown
31860 +       .quad sys_setresuid
31861 +       .quad sys_getresuid
31862 +       .quad sys_setresgid     /* 210 */
31863 +       .quad sys_getresgid
31864 +       .quad sys_chown
31865 +       .quad sys_setuid
31866 +       .quad sys_setgid
31867 +       .quad sys_setfsuid              /* 215 */
31868 +       .quad sys_setfsgid
31869 +       .quad sys_pivot_root
31870 +       .quad sys_mincore
31871 +       .quad sys_madvise
31872 +       .quad compat_sys_getdents64     /* 220 getdents64 */
31873 +       .quad compat_sys_fcntl64        
31874 +       .quad quiet_ni_syscall          /* tux */
31875 +       .quad quiet_ni_syscall          /* security */
31876 +       .quad sys_gettid        
31877 +       .quad sys_readahead     /* 225 */
31878 +       .quad sys_setxattr
31879 +       .quad sys_lsetxattr
31880 +       .quad sys_fsetxattr
31881 +       .quad sys_getxattr
31882 +       .quad sys_lgetxattr     /* 230 */
31883 +       .quad sys_fgetxattr
31884 +       .quad sys_listxattr
31885 +       .quad sys_llistxattr
31886 +       .quad sys_flistxattr
31887 +       .quad sys_removexattr   /* 235 */
31888 +       .quad sys_lremovexattr
31889 +       .quad sys_fremovexattr
31890 +       .quad sys_tkill
31891 +       .quad sys_sendfile64 
31892 +       .quad compat_sys_futex          /* 240 */
31893 +       .quad compat_sys_sched_setaffinity
31894 +       .quad compat_sys_sched_getaffinity
31895 +       .quad sys32_set_thread_area
31896 +       .quad sys32_get_thread_area
31897 +       .quad compat_sys_io_setup       /* 245 */
31898 +       .quad sys_io_destroy
31899 +       .quad compat_sys_io_getevents
31900 +       .quad compat_sys_io_submit
31901 +       .quad sys_io_cancel
31902 +       .quad sys_fadvise64             /* 250 */
31903 +       .quad quiet_ni_syscall  /* free_huge_pages */
31904 +       .quad sys_exit_group
31905 +       .quad sys32_lookup_dcookie
31906 +       .quad sys_epoll_create
31907 +       .quad sys_epoll_ctl             /* 255 */
31908 +       .quad sys_epoll_wait
31909 +       .quad sys_remap_file_pages
31910 +       .quad sys_set_tid_address
31911 +       .quad compat_sys_timer_create
31912 +       .quad compat_sys_timer_settime  /* 260 */
31913 +       .quad compat_sys_timer_gettime
31914 +       .quad sys_timer_getoverrun
31915 +       .quad sys_timer_delete
31916 +       .quad compat_sys_clock_settime
31917 +       .quad compat_sys_clock_gettime  /* 265 */
31918 +       .quad compat_sys_clock_getres
31919 +       .quad compat_sys_clock_nanosleep
31920 +       .quad compat_sys_statfs64
31921 +       .quad compat_sys_fstatfs64
31922 +       .quad sys_tgkill                /* 270 */
31923 +       .quad compat_sys_utimes
31924 +       .quad sys32_fadvise64_64
31925 +       .quad quiet_ni_syscall  /* sys_vserver */
31926 +       .quad sys_mbind
31927 +       .quad compat_sys_get_mempolicy  /* 275 */
31928 +       .quad sys_set_mempolicy
31929 +       .quad compat_sys_mq_open
31930 +       .quad sys_mq_unlink
31931 +       .quad compat_sys_mq_timedsend
31932 +       .quad compat_sys_mq_timedreceive        /* 280 */
31933 +       .quad compat_sys_mq_notify
31934 +       .quad compat_sys_mq_getsetattr
31935 +       .quad compat_sys_kexec_load     /* reserved for kexec */
31936 +       .quad compat_sys_waitid
31937 +       .quad quiet_ni_syscall          /* 285: sys_altroot */
31938 +       .quad sys_add_key
31939 +       .quad sys_request_key
31940 +       .quad sys_keyctl
31941 +       .quad sys_ioprio_set
31942 +       .quad sys_ioprio_get            /* 290 */
31943 +       .quad sys_inotify_init
31944 +       .quad sys_inotify_add_watch
31945 +       .quad sys_inotify_rm_watch
31946 +       .quad sys_migrate_pages
31947 +       .quad compat_sys_openat         /* 295 */
31948 +       .quad sys_mkdirat
31949 +       .quad sys_mknodat
31950 +       .quad sys_fchownat
31951 +       .quad compat_sys_futimesat
31952 +       .quad sys32_fstatat             /* 300 */
31953 +       .quad sys_unlinkat
31954 +       .quad sys_renameat
31955 +       .quad sys_linkat
31956 +       .quad sys_symlinkat
31957 +       .quad sys_readlinkat            /* 305 */
31958 +       .quad sys_fchmodat
31959 +       .quad sys_faccessat
31960 +       .quad compat_sys_pselect6
31961 +       .quad compat_sys_ppoll
31962 +       .quad sys_unshare               /* 310 */
31963 +       .quad compat_sys_set_robust_list
31964 +       .quad compat_sys_get_robust_list
31965 +       .quad sys_splice
31966 +       .quad sys_sync_file_range
31967 +       .quad sys_tee
31968 +       .quad compat_sys_vmsplice
31969 +       .quad compat_sys_move_pages
31970 +       .quad sys_getcpu
31971 +ia32_syscall_end:              
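For readers tracing the compat entry paths, the int $0x80 convention documented in the ia32_syscall comment above (eax = syscall number, ebx..ebp = arguments 1-6, result returned in eax) can be exercised from an ordinary 32-bit process. The sketch below is illustrative only, not part of this patch; it assumes an i386 (-m32) build and the historical i386 __NR_write value of 4.

/* Illustrative userspace sketch (not part of this patch): call write(2) via
 * int $0x80 using the register layout described above.
 * Build with: gcc -m32 -o int80demo int80demo.c
 */
static long int80_syscall3(long nr, long a1, long a2, long a3)
{
        long ret;
        /* eax = syscall number, ebx/ecx/edx = arguments 1..3 */
        __asm__ volatile ("int $0x80"
                          : "=a" (ret)
                          : "a" (nr), "b" (a1), "c" (a2), "d" (a3)
                          : "memory");
        return ret;
}

int main(void)
{
        const char msg[] = "hello from int $0x80\n";
        int80_syscall3(4 /* __NR_write on i386 */, 1, (long)msg, sizeof(msg) - 1);
        return 0;
}

On a 64-bit Xen kernel carrying this patch, that trap enters ENTRY(ia32_syscall) above and is dispatched through ia32_sys_call_table.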
31972 diff -ruNp linux-2.6.19/arch/x86_64/ia32/syscall32-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/syscall32-xen.c
31973 --- linux-2.6.19/arch/x86_64/ia32/syscall32-xen.c       1970-01-01 00:00:00.000000000 +0000
31974 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/syscall32-xen.c     2007-02-02 19:10:26.000000000 +0000
31975 @@ -0,0 +1,128 @@
31976 +/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
31977 +
31978 +/* vsyscall handling for 32bit processes. Map a stub page into the process
31979 +   on demand because 32bit code cannot reach the kernel's fixmaps */
31980 +
31981 +#include <linux/mm.h>
31982 +#include <linux/string.h>
31983 +#include <linux/kernel.h>
31984 +#include <linux/gfp.h>
31985 +#include <linux/init.h>
31986 +#include <linux/stringify.h>
31987 +#include <linux/security.h>
31988 +#include <asm/proto.h>
31989 +#include <asm/tlbflush.h>
31990 +#include <asm/ia32_unistd.h>
31991 +
31992 +#ifdef USE_INT80
31993 +extern unsigned char syscall32_int80[], syscall32_int80_end[];
31994 +#endif
31995 +extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
31996 +extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
31997 +extern int sysctl_vsyscall32;
31998 +
31999 +char *syscall32_page; 
32000 +#ifndef USE_INT80
32001 +static int use_sysenter = -1;
32002 +#endif
32003 +
32004 +static struct page *
32005 +syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
32006 +{
32007 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page);
32008 +       get_page(p);
32009 +       return p;
32010 +}
32011 +
32012 +/* Prevent VMA merging */
32013 +static void syscall32_vma_close(struct vm_area_struct *vma)
32014 +{
32015 +}
32016 +
32017 +static struct vm_operations_struct syscall32_vm_ops = {
32018 +       .close = syscall32_vma_close,
32019 +       .nopage = syscall32_nopage,
32020 +};
32021 +
32022 +struct linux_binprm;
32023 +
32024 +/* Set up a VMA at program startup for the vsyscall page */
32025 +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
32026 +{
32027 +       int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
32028 +       struct vm_area_struct *vma;
32029 +       struct mm_struct *mm = current->mm;
32030 +       int ret;
32031 +
32032 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
32033 +       if (!vma)
32034 +               return -ENOMEM;
32035 +
32036 +       memset(vma, 0, sizeof(struct vm_area_struct));
32037 +       /* Could randomize here */
32038 +       vma->vm_start = VSYSCALL32_BASE;
32039 +       vma->vm_end = VSYSCALL32_END;
32040 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
32041 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
32042 +       vma->vm_flags |= mm->def_flags;
32043 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
32044 +       vma->vm_ops = &syscall32_vm_ops;
32045 +       vma->vm_mm = mm;
32046 +
32047 +       down_write(&mm->mmap_sem);
32048 +       if ((ret = insert_vm_struct(mm, vma))) {
32049 +               up_write(&mm->mmap_sem);
32050 +               kmem_cache_free(vm_area_cachep, vma);
32051 +               return ret;
32052 +       }
32053 +       mm->total_vm += npages;
32054 +       up_write(&mm->mmap_sem);
32055 +       return 0;
32056 +}
32057 +
32058 +static int __init init_syscall32(void)
32059 +{ 
32060 +       syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); 
32061 +       if (!syscall32_page) 
32062 +               panic("Cannot allocate syscall32 page"); 
32063 +
32064 +#ifdef USE_INT80
32065 +       /*
32066 +        * At this point we use int 0x80.
32067 +        */
32068 +       memcpy(syscall32_page, syscall32_int80,
32069 +              syscall32_int80_end - syscall32_int80);
32070 +#else
32071 +       if (use_sysenter > 0) {
32072 +               memcpy(syscall32_page, syscall32_sysenter,
32073 +                      syscall32_sysenter_end - syscall32_sysenter);
32074 +       } else {
32075 +               memcpy(syscall32_page, syscall32_syscall,
32076 +                      syscall32_syscall_end - syscall32_syscall);
32077 +       }       
32078 +#endif
32079 +       return 0;
32080 +} 
32081 +
32082 +/*
32083 + * This must be done early in case we have an initrd containing 32-bit
32084 + * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64.
32085 + */    
32086 +core_initcall(init_syscall32); 
32087 +
32088 +/* May not be __init: called during resume */
32089 +void syscall32_cpu_init(void)
32090 +{
32091 +#ifndef USE_INT80
32092 +       if (use_sysenter < 0)
32093 +               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
32094 +
32095 +       /* Load these always in case some future AMD CPU supports
32096 +          SYSENTER from compat mode too. */
32097 +       checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
32098 +       checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
32099 +       checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
32100 +
32101 +       wrmsrl(MSR_CSTAR, ia32_cstar_target);
32102 +#endif
32103 +}
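The page populated by init_syscall32() above is what a compat process sees as its vsyscall area; the 32-bit ELF loader typically advertises the __kernel_vsyscall entry inside it through the AT_SYSINFO auxiliary-vector tag. The sketch below is a hedged userspace illustration, not part of this patch: it assumes glibc's getauxval() and a 32-bit build, and merely prints where the entry point was mapped.

/* Illustrative only (not part of this patch): print the address of the
 * 32-bit vsyscall entry point advertised to compat processes.
 * Assumes glibc getauxval(); build with: gcc -m32 -o vsysaddr vsysaddr.c
 */
#include <stdio.h>
#include <elf.h>
#include <sys/auxv.h>

int main(void)
{
        unsigned long entry = getauxval(AT_SYSINFO);    /* __kernel_vsyscall */
        if (entry)
                printf("__kernel_vsyscall mapped at %#lx\n", entry);
        else
                printf("AT_SYSINFO not provided by this kernel\n");
        return 0;
}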
32104 diff -ruNp linux-2.6.19/arch/x86_64/ia32/syscall32_syscall-xen.S linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/syscall32_syscall-xen.S
32105 --- linux-2.6.19/arch/x86_64/ia32/syscall32_syscall-xen.S       1970-01-01 00:00:00.000000000 +0000
32106 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/syscall32_syscall-xen.S     2007-02-02 19:10:26.000000000 +0000
32107 @@ -0,0 +1,28 @@
32108 +/* 32bit VDSOs mapped into user space. */
32109 +
32110 +       .section ".init.data","aw"
32111 +
32112 +#ifdef USE_INT80
32113 +
32114 +       .globl syscall32_int80
32115 +       .globl syscall32_int80_end
32116 +
32117 +syscall32_int80:
32118 +       .incbin "arch/x86_64/ia32/vsyscall-int80.so"
32119 +syscall32_int80_end:
32120 +
32121 +#endif
32122 +
32123 +       .globl syscall32_syscall
32124 +       .globl syscall32_syscall_end
32125 +
32126 +syscall32_syscall:
32127 +       .incbin "arch/x86_64/ia32/vsyscall-syscall.so"
32128 +syscall32_syscall_end:
32129 +
32130 +       .globl syscall32_sysenter
32131 +       .globl syscall32_sysenter_end
32132 +
32133 +syscall32_sysenter:
32134 +       .incbin "arch/x86_64/ia32/vsyscall-sysenter.so"
32135 +syscall32_sysenter_end:
32136 diff -ruNp linux-2.6.19/arch/x86_64/ia32/vsyscall-int80.S linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/vsyscall-int80.S
32137 --- linux-2.6.19/arch/x86_64/ia32/vsyscall-int80.S      1970-01-01 00:00:00.000000000 +0000
32138 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/vsyscall-int80.S    2007-02-02 19:10:26.000000000 +0000
32139 @@ -0,0 +1,58 @@
32140 +/*
32141 + * Code for the vsyscall page.  This version uses the old int $0x80 method.
32142 + *
32143 + * NOTE:
32144 + * 1) __kernel_vsyscall _must_ be first in this page.
32145 + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
32146 + *    for details.
32147 + */
32148 +#include <asm/ia32_unistd.h>
32149 +#include <asm/asm-offsets.h>
32150 +
32151 +       .code32
32152 +       .text
32153 +       .section .text.vsyscall,"ax"
32154 +       .globl __kernel_vsyscall
32155 +       .type __kernel_vsyscall,@function
32156 +__kernel_vsyscall:
32157 +.LSTART_vsyscall:
32158 +       int $0x80
32159 +       ret
32160 +.LEND_vsyscall:
32161 +       .size __kernel_vsyscall,.-.LSTART_vsyscall
32162 +       .previous
32163 +
32164 +       .section .eh_frame,"a",@progbits
32165 +.LSTARTFRAME:
32166 +       .long .LENDCIE-.LSTARTCIE
32167 +.LSTARTCIE:
32168 +       .long 0                 /* CIE ID */
32169 +       .byte 1                 /* Version number */
32170 +       .string "zR"            /* NUL-terminated augmentation string */
32171 +       .uleb128 1              /* Code alignment factor */
32172 +       .sleb128 -4             /* Data alignment factor */
32173 +       .byte 8                 /* Return address register column */
32174 +       .uleb128 1              /* Augmentation value length */
32175 +       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
32176 +       .byte 0x0c              /* DW_CFA_def_cfa */
32177 +       .uleb128 4
32178 +       .uleb128 4
32179 +       .byte 0x88              /* DW_CFA_offset, column 0x8 */
32180 +       .uleb128 1
32181 +       .align 4
32182 +.LENDCIE:
32183 +
32184 +       .long .LENDFDE1-.LSTARTFDE1     /* Length FDE */
32185 +.LSTARTFDE1:
32186 +       .long .LSTARTFDE1-.LSTARTFRAME  /* CIE pointer */
32187 +       .long .LSTART_vsyscall-.        /* PC-relative start address */
32188 +       .long .LEND_vsyscall-.LSTART_vsyscall
32189 +       .uleb128 0                      /* Augmentation length */
32190 +       .align 4
32191 +.LENDFDE1:
32192 +               
32193 +/*
32194 + * Get the common code for the sigreturn entry points.
32195 + */
32196 +#define SYSCALL_ENTER_KERNEL    int $0x80
32197 +#include "vsyscall-sigreturn.S"
32198 diff -ruNp linux-2.6.19/arch/x86_64/ia32/vsyscall-sigreturn.S linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/vsyscall-sigreturn.S
32199 --- linux-2.6.19/arch/x86_64/ia32/vsyscall-sigreturn.S  2006-11-29 21:57:37.000000000 +0000
32200 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/ia32/vsyscall-sigreturn.S        2007-02-02 19:10:26.000000000 +0000
32201 @@ -139,5 +139,5 @@ __kernel_rt_sigreturn:
32202         .align 4
32203  .LENDFDE3:
32204  
32205 -#include "../../i386/kernel/vsyscall-note.S"
32206 +#include <vsyscall-note.S>
32207  
32208 diff -ruNp linux-2.6.19/arch/x86_64/kernel/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/Makefile
32209 --- linux-2.6.19/arch/x86_64/kernel/Makefile    2006-11-29 21:57:37.000000000 +0000
32210 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/Makefile  2007-02-02 19:10:26.000000000 +0000
32211 @@ -21,11 +21,13 @@ obj-$(CONFIG_MICROCODE)             += microcode.o
32212  obj-$(CONFIG_X86_CPUID)                += cpuid.o
32213  obj-$(CONFIG_SMP)              += smp.o smpboot.o trampoline.o
32214  obj-y                          += apic.o  nmi.o
32215 +obj-$(CONFIG_X86_XEN_GENAPIC)  += genapic.o genapic_xen.o
32216  obj-y                          += io_apic.o mpparse.o \
32217                 genapic.o genapic_cluster.o genapic_flat.o
32218  obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o crash.o
32219  obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
32220 -obj-$(CONFIG_PM)               += suspend.o
32221 +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
32222 +obj-$(CONFIG_ACPI_SLEEP)       += suspend.o
32223  obj-$(CONFIG_SUSPEND_SHARED)   += suspend_asm.o
32224  obj-$(CONFIG_CPU_FREQ)         += cpufreq/
32225  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
32226 @@ -56,3 +58,19 @@ quirks-y                     += ../../i386/kernel/quirks.o
32227  i8237-y                                += ../../i386/kernel/i8237.o
32228  msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
32229  alternative-y                  += ../../i386/kernel/alternative.o
32230 +
32231 +ifdef CONFIG_XEN
32232 +obj-y                          += irqflags.o
32233 +time-y                         += ../../i386/kernel/time-xen.o
32234 +pci-dma-y                      += ../../i386/kernel/pci-dma-xen.o
32235 +microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
32236 +quirks-y                       := ../../i386/kernel/quirks-xen.o
32237 +
32238 +n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
32239 +
32240 +include $(srctree)/scripts/Makefile.xen
32241 +
32242 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
32243 +obj-y := $(call cherrypickxen, $(obj-y))
32244 +extra-y := $(call cherrypickxen, $(extra-y))
32245 +endif
32246 diff -ruNp linux-2.6.19/arch/x86_64/kernel/acpi/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/acpi/Makefile
32247 --- linux-2.6.19/arch/x86_64/kernel/acpi/Makefile       2006-11-29 21:57:37.000000000 +0000
32248 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/acpi/Makefile     2007-02-02 19:10:26.000000000 +0000
32249 @@ -7,3 +7,4 @@ obj-y                   += processor.o
32250  processor-y            := ../../../i386/kernel/acpi/processor.o ../../../i386/kernel/acpi/cstate.o
32251  endif
32252  
32253 +boot-$(CONFIG_XEN)             := ../../../i386/kernel/acpi/boot-xen.o
32254 diff -ruNp linux-2.6.19/arch/x86_64/kernel/apic-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/apic-xen.c
32255 --- linux-2.6.19/arch/x86_64/kernel/apic-xen.c  1970-01-01 00:00:00.000000000 +0000
32256 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/apic-xen.c        2007-02-02 19:10:26.000000000 +0000
32257 @@ -0,0 +1,254 @@
32258 +/*
32259 + *     Local APIC handling, local APIC timers
32260 + *
32261 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
32262 + *
32263 + *     Fixes
32264 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
32265 + *                                     thanks to Eric Gilmore
32266 + *                                     and Rolf G. Tews
32267 + *                                     for testing these extensively.
32268 + *     Maciej W. Rozycki       :       Various updates and fixes.
32269 + *     Mikael Pettersson       :       Power Management for UP-APIC.
32270 + *     Pavel Machek and
32271 + *     Mikael Pettersson       :       PM converted to driver model.
32272 + */
32273 +
32274 +#include <linux/init.h>
32275 +
32276 +#include <linux/mm.h>
32277 +#include <linux/delay.h>
32278 +#include <linux/bootmem.h>
32279 +#include <linux/smp_lock.h>
32280 +#include <linux/interrupt.h>
32281 +#include <linux/mc146818rtc.h>
32282 +#include <linux/kernel_stat.h>
32283 +#include <linux/sysdev.h>
32284 +#include <linux/module.h>
32285 +#include <linux/ioport.h>
32286 +
32287 +#include <asm/atomic.h>
32288 +#include <asm/smp.h>
32289 +#include <asm/mtrr.h>
32290 +#include <asm/mpspec.h>
32291 +#include <asm/pgalloc.h>
32292 +#include <asm/mach_apic.h>
32293 +#include <asm/nmi.h>
32294 +#include <asm/idle.h>
32295 +#include <asm/proto.h>
32296 +#include <asm/timex.h>
32297 +#include <asm/apic.h>
32298 +
32299 +int apic_mapped;
32300 +int apic_verbosity;
32301 +
32302 +/*
32303 + * 'What should we do if we get a hw irq event on an illegal vector?'
32304 + * Each architecture has to answer this for itself.
32305 + */
32306 +void ack_bad_irq(unsigned int irq)
32307 +{
32308 +       printk("unexpected IRQ trap at vector %02x\n", irq);
32309 +       /*
32310 +        * Currently unexpected vectors happen only on SMP and APIC.
32311 +        * We _must_ ack these because every local APIC has only N
32312 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
32313 +        * holds up an irq slot - in excessive cases (when multiple
32314 +        * unexpected vectors occur) that might lock up the APIC
32315 +        * completely.
32316 +        * But don't ack when the APIC is disabled. -AK
32317 +        */
32318 +       if (!disable_apic)
32319 +               ack_APIC_irq();
32320 +}
32321 +
32322 +int setup_profiling_timer(unsigned int multiplier)
32323 +{
32324 +       return -EINVAL;
32325 +}
32326 +
32327 +void smp_local_timer_interrupt(void)
32328 +{
32329 +       profile_tick(CPU_PROFILING);
32330 +#ifndef CONFIG_XEN
32331 +#ifdef CONFIG_SMP
32332 +       update_process_times(user_mode(get_irq_regs()));
32333 +#endif
32334 +       if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
32335 +               main_timer_handler();
32336 +#endif
32337 +       /*
32338 +        * We take the 'long' return path, and there every subsystem
32339 +        * grabs the appropriate locks (kernel lock/ irq lock).
32340 +        *
32341 +        * We might want to decouple profiling from the 'long path',
32342 +        * and do the profiling totally in assembly.
32343 +        *
32344 +        * Currently this isn't too much of an issue (performance wise),
32345 +        * we can take more than 100K local irqs per second on a 100 MHz P5.
32346 +        */
32347 +}
32348 +
32349 +/*
32350 + * Local APIC timer interrupt. This is the most natural way for doing
32351 + * Local APIC timer interrupt. This is the most natural way of doing
32352 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
32353 + *
32354 + * [ if a single-CPU system runs an SMP kernel then we call the local
32355 + *   interrupt as well. Thus we cannot inline the local irq ... ]
32356 + */
32357 +void smp_apic_timer_interrupt(struct pt_regs *regs)
32358 +{
32359 +       struct pt_regs *old_regs = set_irq_regs(regs);
32360 +
32361 +       /*
32362 +        * the NMI deadlock-detector uses this.
32363 +        */
32364 +       add_pda(apic_timer_irqs, 1);
32365 +
32366 +       /*
32367 +        * NOTE! We'd better ACK the irq immediately,
32368 +        * because timer handling can be slow.
32369 +        */
32370 +       ack_APIC_irq();
32371 +       /*
32372 +        * update_process_times() expects us to have done irq_enter().
32373 +        * Besides, if we don't, timer interrupts ignore the global
32374 +        * interrupt lock, which is the WrongThing (tm) to do.
32375 +        */
32376 +       exit_idle();
32377 +       irq_enter();
32378 +       smp_local_timer_interrupt();
32379 +       irq_exit();
32380 +       set_irq_regs(old_regs);
32381 +}
32382 +
32383 +/*
32384 + * This interrupt should _never_ happen with our APIC/SMP architecture
32385 + */
32386 +asmlinkage void smp_spurious_interrupt(void)
32387 +{
32388 +       unsigned int v;
32389 +       exit_idle();
32390 +       irq_enter();
32391 +       /*
32392 +        * Check if this really is a spurious interrupt and ACK it
32393 +        * if it is a vectored one.  Just in case...
32394 +        * Spurious interrupts should not be ACKed.
32395 +        */
32396 +       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
32397 +       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
32398 +               ack_APIC_irq();
32399 +
32400 +#if 0
32401 +       static unsigned long last_warning; 
32402 +       static unsigned long skipped; 
32403 +
32404 +       /* see sw-dev-man vol 3, chapter 7.4.13.5 */
32405 +       if (time_before(last_warning+30*HZ,jiffies)) { 
32406 +               printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
32407 +                      smp_processor_id(), skipped);
32408 +               last_warning = jiffies; 
32409 +               skipped = 0;
32410 +       } else { 
32411 +               skipped++; 
32412 +       } 
32413 +#endif 
32414 +       irq_exit();
32415 +}
32416 +
32417 +/*
32418 + * This interrupt should never happen with our APIC/SMP architecture
32419 + */
32420 +
32421 +asmlinkage void smp_error_interrupt(void)
32422 +{
32423 +       unsigned int v, v1;
32424 +
32425 +       exit_idle();
32426 +       irq_enter();
32427 +       /* First tickle the hardware, only then report what went on. -- REW */
32428 +       v = apic_read(APIC_ESR);
32429 +       apic_write(APIC_ESR, 0);
32430 +       v1 = apic_read(APIC_ESR);
32431 +       ack_APIC_irq();
32432 +       atomic_inc(&irq_err_count);
32433 +
32434 +       /* Here is what the APIC error bits mean:
32435 +          0: Send CS error
32436 +          1: Receive CS error
32437 +          2: Send accept error
32438 +          3: Receive accept error
32439 +          4: Reserved
32440 +          5: Send illegal vector
32441 +          6: Received illegal vector
32442 +          7: Illegal register address
32443 +       */
32444 +       printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
32445 +               smp_processor_id(), v , v1);
32446 +       irq_exit();
32447 +}
32448 +
32449 +int disable_apic; 
32450 +
32451 +/*
32452 + * This initializes the IO-APIC and APIC hardware if this is
32453 + * a UP kernel.
32454 + */
32455 +int __init APIC_init_uniprocessor (void)
32456 +{
32457 +       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
32458 +               setup_IO_APIC();
32459 +       return 0;
32460 +}
32461 +
32462 +#ifndef CONFIG_XEN
32463 +static __init int setup_disableapic(char *str) 
32464 +{ 
32465 +       disable_apic = 1;
32466 +       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
32467 +       return 0;
32468 +}
32469 +early_param("disableapic", setup_disableapic);
32470 +
32471 +/* same as disableapic, for compatibility */
32472 +static __init int setup_nolapic(char *str) 
32473 +{ 
32474 +       return setup_disableapic(str);
32475 +} 
32476 +early_param("nolapic", setup_nolapic);
32477 +
32478 +static __init int setup_noapictimer(char *str) 
32479 +{ 
32480 +       if (str[0] != ' ' && str[0] != 0)
32481 +               return 0;
32482 +       disable_apic_timer = 1;
32483 +       return 1;
32484 +} 
32485 +
32486 +static __init int setup_apicmaintimer(char *str)
32487 +{
32488 +       apic_runs_main_timer = 1;
32489 +       nohpet = 1;
32490 +       return 1;
32491 +}
32492 +__setup("apicmaintimer", setup_apicmaintimer);
32493 +
32494 +static __init int setup_noapicmaintimer(char *str)
32495 +{
32496 +       apic_runs_main_timer = -1;
32497 +       return 1;
32498 +}
32499 +__setup("noapicmaintimer", setup_noapicmaintimer);
32500 +
32501 +static __init int setup_apicpmtimer(char *s)
32502 +{
32503 +       apic_calibrate_pmtmr = 1;
32504 +       notsc_setup(NULL);
32505 +       return setup_apicmaintimer(NULL);
32506 +}
32507 +__setup("apicpmtimer", setup_apicpmtimer);
32508 +
32509 +__setup("noapictimer", setup_noapictimer); 
32510 +
32511 +#endif
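The spurious-interrupt path in apic-xen.c above reads APIC_ISR + ((vector & ~0x1f) >> 1) and tests bit (vector & 0x1f): the 256 vector bits are spread over eight 32-bit in-service registers spaced 0x10 bytes apart. The standalone sketch below only reproduces that arithmetic so the expression is easy to sanity-check; the helper name is made up and APIC_ISR is assumed to be the usual 0x100 offset.

/* Illustrative only: the ISR offset/bit arithmetic used in
 * smp_spurious_interrupt() above, as a standalone calculation.
 */
#include <stdio.h>

#define APIC_ISR 0x100  /* assumed base offset of the in-service registers */

static void isr_location(unsigned int vector, unsigned int *reg_off, unsigned int *bit)
{
        /* (vector & ~0x1f) picks the 32-vector group; each group's register is
         * 0x10 bytes further along, hence the >> 1 (32 vectors -> 0x10 bytes). */
        *reg_off = APIC_ISR + ((vector & ~0x1f) >> 1);
        *bit     = vector & 0x1f;
}

int main(void)
{
        unsigned int off, bit;
        isr_location(0xff /* SPURIOUS_APIC_VECTOR */, &off, &bit);
        printf("vector 0xff -> register offset %#x, bit %u\n", off, bit);
        return 0;
}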
32512 diff -ruNp linux-2.6.19/arch/x86_64/kernel/asm-offsets.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/asm-offsets.c
32513 --- linux-2.6.19/arch/x86_64/kernel/asm-offsets.c       2006-11-29 21:57:37.000000000 +0000
32514 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/asm-offsets.c     2007-02-02 19:10:26.000000000 +0000
32515 @@ -67,8 +67,10 @@ int main(void)
32516         DEFINE(pbe_address, offsetof(struct pbe, address));
32517         DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
32518         DEFINE(pbe_next, offsetof(struct pbe, next));
32519 +#ifndef CONFIG_X86_NO_TSS
32520         BLANK();
32521         DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
32522 +#endif
32523         BLANK();
32524         DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
32525         return 0;
32526 diff -ruNp linux-2.6.19/arch/x86_64/kernel/crash.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/crash.c
32527 --- linux-2.6.19/arch/x86_64/kernel/crash.c     2006-11-29 21:57:37.000000000 +0000
32528 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/crash.c   2007-02-02 19:10:26.000000000 +0000
32529 @@ -93,6 +93,7 @@ static void crash_save_self(struct pt_re
32530         crash_save_this_cpu(regs, cpu);
32531  }
32532  
32533 +#ifndef CONFIG_XEN
32534  #ifdef CONFIG_SMP
32535  static atomic_t waiting_for_crash_ipi;
32536  
32537 @@ -172,6 +173,7 @@ static void nmi_shootdown_cpus(void)
32538         /* There are no cpus to shootdown */
32539  }
32540  #endif
32541 +#endif /* CONFIG_XEN */
32542  
32543  void machine_crash_shutdown(struct pt_regs *regs)
32544  {
32545 @@ -189,12 +191,14 @@ void machine_crash_shutdown(struct pt_re
32546  
32547         /* Make a note of crashing cpu. Will be used in NMI callback.*/
32548         crashing_cpu = smp_processor_id();
32549 +
32550 +#ifndef CONFIG_XEN
32551         nmi_shootdown_cpus();
32552  
32553         if(cpu_has_apic)
32554                  disable_local_APIC();
32555  
32556         disable_IO_APIC();
32557 -
32558 +#endif /* CONFIG_XEN */
32559         crash_save_self(regs);
32560  }
32561 diff -ruNp linux-2.6.19/arch/x86_64/kernel/e820-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/e820-xen.c
32562 --- linux-2.6.19/arch/x86_64/kernel/e820-xen.c  1970-01-01 00:00:00.000000000 +0000
32563 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/e820-xen.c        2007-02-02 19:10:26.000000000 +0000
32564 @@ -0,0 +1,763 @@
32565 +/* 
32566 + * Handle the memory map.
32567 + * The functions here do the job until bootmem takes over.
32568 + *
32569 + *  Getting sanitize_e820_map() in sync with i386 version by applying change:
32570 + *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
32571 + *     Alex Achenbach <xela@slit.de>, December 2002.
32572 + *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
32573 + *
32574 + */
32575 +#include <linux/kernel.h>
32576 +#include <linux/types.h>
32577 +#include <linux/init.h>
32578 +#include <linux/bootmem.h>
32579 +#include <linux/ioport.h>
32580 +#include <linux/string.h>
32581 +#include <linux/kexec.h>
32582 +#include <linux/module.h>
32583 +#include <linux/mm.h>
32584 +
32585 +#include <asm/pgtable.h>
32586 +#include <asm/page.h>
32587 +#include <asm/e820.h>
32588 +#include <asm/proto.h>
32589 +#include <asm/bootsetup.h>
32590 +#include <asm/sections.h>
32591 +#include <xen/interface/memory.h>
32592 +
32593 +struct e820map e820 __initdata;
32594 +
32595 +/* 
32596 + * PFN of last memory page.
32597 + */
32598 +unsigned long end_pfn; 
32599 +EXPORT_SYMBOL(end_pfn);
32600 +
32601 +/* 
32602 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
32603 + * The direct mapping extends to end_pfn_map, so that we can directly access
32604 + * apertures, ACPI and other tables without having to play with fixmaps.
32605 + */ 
32606 +unsigned long end_pfn_map; 
32607 +
32608 +/* 
32609 + * Last pfn which the user wants to use.
32610 + */
32611 +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
32612 +
32613 +extern struct resource code_resource, data_resource;
32614 +
32615 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
32616 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
32617 +{ 
32618 +       unsigned long addr = *addrp, last = addr + size; 
32619 +
32620 +#ifndef CONFIG_XEN
32621 +       /* various gunk below that needed for SMP startup */
32622 +       if (addr < 0x8000) { 
32623 +               *addrp = PAGE_ALIGN(0x8000);
32624 +               return 1; 
32625 +       }
32626 +
32627 +       /* direct mapping tables of the kernel */
32628 +       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
32629 +               *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
32630 +               return 1;
32631 +       } 
32632 +
32633 +       /* initrd */ 
32634 +#ifdef CONFIG_BLK_DEV_INITRD
32635 +       if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
32636 +           addr < INITRD_START+INITRD_SIZE) { 
32637 +               *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
32638 +               return 1;
32639 +       } 
32640 +#endif
32641 +       /* kernel code */
32642 +       if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
32643 +               *addrp = PAGE_ALIGN(__pa_symbol(&_end));
32644 +               return 1;
32645 +       }
32646 +
32647 +       if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
32648 +               *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
32649 +               return 1;
32650 +       }
32651 +
32652 +       /* XXX ramdisk image here? */ 
32653 +#else
32654 +       if (last < (table_end<<PAGE_SHIFT)) {
32655 +               *addrp = table_end << PAGE_SHIFT;
32656 +               return 1;
32657 +       }
32658 +#endif
32659 +       return 0;
32660 +} 
32661 +
32662 +/*
32663 + * This function checks if any part of the range <start,end> is mapped
32664 + * with type.
32665 + */
32666 +int __meminit
32667 +e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
32668 +{ 
32669 +       int i;
32670 +       for (i = 0; i < e820.nr_map; i++) { 
32671 +               struct e820entry *ei = &e820.map[i]; 
32672 +               if (type && ei->type != type) 
32673 +                       continue;
32674 +               if (ei->addr >= end || ei->addr + ei->size <= start)
32675 +                       continue; 
32676 +               return 1; 
32677 +       } 
32678 +       return 0;
32679 +}
32680 +
32681 +/*
32682 + * This function checks if the entire range <start,end> is mapped with type.
32683 + *
32684 + * Note: this function only works correctly if the e820 table is sorted and
32685 + * non-overlapping, which is the case.
32686 + */
32687 +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
32688 +{
32689 +       int i;
32690 +       for (i = 0; i < e820.nr_map; i++) {
32691 +               struct e820entry *ei = &e820.map[i];
32692 +               if (type && ei->type != type)
32693 +                       continue;
32694 +               /* is the region (part) in overlap with the current region ?*/
32695 +               if (ei->addr >= end || ei->addr + ei->size <= start)
32696 +                       continue;
32697 +
32698 +               /* if the region covers the beginning of <start,end> we move
32699 +                * start to the end of the region, since coverage holds up to there
32700 +                */
32701 +               if (ei->addr <= start)
32702 +                       start = ei->addr + ei->size;
32703 +               /* if start is now at or beyond end, we're done, full coverage */
32704 +               if (start >= end)
32705 +                       return 1; /* we're done */
32706 +       }
32707 +       return 0;
32708 +}
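e820_all_mapped() relies on the map being sorted and non-overlapping: every matching region that reaches the current start lets it advance start to that region's end, and full coverage is declared once start meets end. Below is a small self-contained sketch of the same walk over made-up ranges (type filtering omitted; not kernel code).

/* Illustrative only: the "advance start through covering regions" walk that
 * e820_all_mapped() performs above, on a toy sorted, non-overlapping map. */
#include <stdio.h>

struct range { unsigned long addr, size; };

static int all_mapped(const struct range *map, int n,
                      unsigned long start, unsigned long end)
{
        for (int i = 0; i < n; i++) {
                if (map[i].addr >= end || map[i].addr + map[i].size <= start)
                        continue;                 /* no overlap with <start,end> */
                if (map[i].addr <= start)
                        start = map[i].addr + map[i].size;   /* covered so far */
                if (start >= end)
                        return 1;                 /* fully covered */
        }
        return 0;
}

int main(void)
{
        /* two regions: [0x0,0x9f000) and [0x100000,0x200000) */
        struct range map[] = { { 0x0, 0x9f000 }, { 0x100000, 0x100000 } };
        printf("%d\n", all_mapped(map, 2, 0x100000, 0x180000));  /* 1: covered */
        printf("%d\n", all_mapped(map, 2, 0x90000, 0x110000));   /* 0: gap     */
        return 0;
}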
32709 +
32710 +/* 
32711 + * Find a free area in a specific range. 
32712 + */ 
32713 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
32714 +{ 
32715 +       int i; 
32716 +       for (i = 0; i < e820.nr_map; i++) { 
32717 +               struct e820entry *ei = &e820.map[i]; 
32718 +               unsigned long addr = ei->addr, last; 
32719 +               if (ei->type != E820_RAM) 
32720 +                       continue; 
32721 +               if (addr < start) 
32722 +                       addr = start;
32723 +               if (addr > ei->addr + ei->size) 
32724 +                       continue; 
32725 +               while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
32726 +                       ;
32727 +               last = PAGE_ALIGN(addr) + size;
32728 +               if (last > ei->addr + ei->size)
32729 +                       continue;
32730 +               if (last > end) 
32731 +                       continue;
32732 +               return addr; 
32733 +       } 
32734 +       return -1UL;            
32735 +} 
32736 +
32737 +/*
32738 + * Find the highest page frame number we have available
32739 + */
32740 +unsigned long __init e820_end_of_ram(void)
32741 +{
32742 +       unsigned long end_pfn = 0;
32743 +       end_pfn = find_max_pfn_with_active_regions();
32744 +       
32745 +       if (end_pfn > end_pfn_map) 
32746 +               end_pfn_map = end_pfn;
32747 +       if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
32748 +               end_pfn_map = MAXMEM>>PAGE_SHIFT;
32749 +       if (end_pfn > end_user_pfn)
32750 +               end_pfn = end_user_pfn;
32751 +       if (end_pfn > end_pfn_map) 
32752 +               end_pfn = end_pfn_map; 
32753 +
32754 +       printk("end_pfn_map = %lu\n", end_pfn_map);
32755 +       return end_pfn; 
32756 +}
32757 +
32758 +/*
32759 + * Mark e820 reserved areas as busy for the resource manager.
32760 + */
32761 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
32762 +{
32763 +       int i;
32764 +       for (i = 0; i < nr_map; i++) {
32765 +               struct resource *res;
32766 +               res = alloc_bootmem_low(sizeof(struct resource));
32767 +               switch (e820[i].type) {
32768 +               case E820_RAM:  res->name = "System RAM"; break;
32769 +               case E820_ACPI: res->name = "ACPI Tables"; break;
32770 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
32771 +               default:        res->name = "reserved";
32772 +               }
32773 +               res->start = e820[i].addr;
32774 +               res->end = res->start + e820[i].size - 1;
32775 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
32776 +               request_resource(&iomem_resource, res);
32777 +               if (e820[i].type == E820_RAM) {
32778 +                       /*
32779 +                        *  We don't know which RAM region contains kernel data,
32780 +                        *  so we try it repeatedly and let the resource manager
32781 +                        *  test it.
32782 +                        */
32783 +#ifndef CONFIG_XEN
32784 +                       request_resource(res, &code_resource);
32785 +                       request_resource(res, &data_resource);
32786 +#endif
32787 +#ifdef CONFIG_KEXEC
32788 +                       if (crashk_res.start != crashk_res.end)
32789 +                               request_resource(res, &crashk_res);
32790 +#ifdef CONFIG_XEN
32791 +                       xen_machine_kexec_register_resources(res);
32792 +#endif
32793 +#endif
32794 +               }
32795 +       }
32796 +}
32797 +
32798 +/* Mark pages corresponding to given address range as nosave */
32799 +static void __init
32800 +e820_mark_nosave_range(unsigned long start, unsigned long end)
32801 +{
32802 +       unsigned long pfn, max_pfn;
32803 +
32804 +       if (start >= end)
32805 +               return;
32806 +
32807 +       printk("Nosave address range: %016lx - %016lx\n", start, end);
32808 +       max_pfn = end >> PAGE_SHIFT;
32809 +       for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
32810 +               if (pfn_valid(pfn))
32811 +                       SetPageNosave(pfn_to_page(pfn));
32812 +}
32813 +
32814 +/*
32815 + * Find the ranges of physical addresses that do not correspond to
32816 + * e820 RAM areas and mark the corresponding pages as nosave for software
32817 + * suspend and suspend to RAM.
32818 + *
32819 + * This function requires the e820 map to be sorted and without any
32820 + * overlapping entries and assumes the first e820 area to be RAM.
32821 + */
32822 +void __init e820_mark_nosave_regions(void)
32823 +{
32824 +       int i;
32825 +       unsigned long paddr;
32826 +
32827 +       paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
32828 +       for (i = 1; i < e820.nr_map; i++) {
32829 +               struct e820entry *ei = &e820.map[i];
32830 +
32831 +               if (paddr < ei->addr)
32832 +                       e820_mark_nosave_range(paddr,
32833 +                                       round_up(ei->addr, PAGE_SIZE));
32834 +
32835 +               paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
32836 +               if (ei->type != E820_RAM)
32837 +                       e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
32838 +                                       paddr);
32839 +
32840 +               if (paddr >= (end_pfn << PAGE_SHIFT))
32841 +                       break;
32842 +       }
32843 +}
32844 +
32845 +/* Walk the e820 map and register active regions within a node */
32846 +void __init
32847 +e820_register_active_regions(int nid, unsigned long start_pfn,
32848 +                                                       unsigned long end_pfn)
32849 +{
32850 +       int i;
32851 +       unsigned long ei_startpfn, ei_endpfn;
32852 +
32853 +       for (i = 0; i < e820.nr_map; i++) {
32854 +               struct e820entry *ei = &e820.map[i];
32855 +               ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
32856 +               ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
32857 +                                                               >> PAGE_SHIFT;
32858 +
32859 +               /* Skip map entries smaller than a page */
32860 +               if (ei_startpfn >= ei_endpfn)
32861 +                       continue;
32862 +
32863 +               /* Check if end_pfn_map should be updated */
32864 +               if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
32865 +                       end_pfn_map = ei_endpfn;
32866 +
32867 +               /* Skip if map is outside the node */
32868 +               if (ei->type != E820_RAM ||
32869 +                               ei_endpfn <= start_pfn ||
32870 +                               ei_startpfn >= end_pfn)
32871 +                       continue;
32872 +
32873 +               /* Check for overlaps */
32874 +               if (ei_startpfn < start_pfn)
32875 +                       ei_startpfn = start_pfn;
32876 +               if (ei_endpfn > end_pfn)
32877 +                       ei_endpfn = end_pfn;
32878 +
32879 +               /* Obey end_user_pfn to save on memmap */
32880 +               if (ei_startpfn >= end_user_pfn)
32881 +                       continue;
32882 +               if (ei_endpfn > end_user_pfn)
32883 +                       ei_endpfn = end_user_pfn;
32884 +
32885 +               add_active_range(nid, ei_startpfn, ei_endpfn);
32886 +       }
32887 +}
32888 +
32889 +/* 
32890 + * Add a memory region to the kernel e820 map.
32891 + */ 
32892 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
32893 +{
32894 +       int x = e820.nr_map;
32895 +
32896 +       if (x == E820MAX) {
32897 +               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
32898 +               return;
32899 +       }
32900 +
32901 +       e820.map[x].addr = start;
32902 +       e820.map[x].size = size;
32903 +       e820.map[x].type = type;
32904 +       e820.nr_map++;
32905 +}
32906 +
32907 +void __init e820_print_map(char *who)
32908 +{
32909 +       int i;
32910 +
32911 +       for (i = 0; i < e820.nr_map; i++) {
32912 +               printk(" %s: %016Lx - %016Lx ", who,
32913 +                       (unsigned long long) e820.map[i].addr,
32914 +                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
32915 +               switch (e820.map[i].type) {
32916 +               case E820_RAM:  printk("(usable)\n");
32917 +                               break;
32918 +               case E820_RESERVED:
32919 +                               printk("(reserved)\n");
32920 +                               break;
32921 +               case E820_ACPI:
32922 +                               printk("(ACPI data)\n");
32923 +                               break;
32924 +               case E820_NVS:
32925 +                               printk("(ACPI NVS)\n");
32926 +                               break;
32927 +               default:        printk("type %u\n", e820.map[i].type);
32928 +                               break;
32929 +               }
32930 +       }
32931 +}
32932 +
32933 +/*
32934 + * Sanitize the BIOS e820 map.
32935 + *
32936 + * Some e820 responses include overlapping entries.  The following 
32937 + * replaces the original e820 map with a new one, removing overlaps.
32938 + *
32939 + */
32940 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
32941 +{
32942 +       struct change_member {
32943 +               struct e820entry *pbios; /* pointer to original bios entry */
32944 +               unsigned long long addr; /* address for this change point */
32945 +       };
32946 +       static struct change_member change_point_list[2*E820MAX] __initdata;
32947 +       static struct change_member *change_point[2*E820MAX] __initdata;
32948 +       static struct e820entry *overlap_list[E820MAX] __initdata;
32949 +       static struct e820entry new_bios[E820MAX] __initdata;
32950 +       struct change_member *change_tmp;
32951 +       unsigned long current_type, last_type;
32952 +       unsigned long long last_addr;
32953 +       int chgidx, still_changing;
32954 +       int overlap_entries;
32955 +       int new_bios_entry;
32956 +       int old_nr, new_nr, chg_nr;
32957 +       int i;
32958 +
32959 +       /*
32960 +               Visually we're performing the following (1,2,3,4 = memory types)...
32961 +
32962 +               Sample memory map (w/overlaps):
32963 +                  ____22__________________
32964 +                  ______________________4_
32965 +                  ____1111________________
32966 +                  _44_____________________
32967 +                  11111111________________
32968 +                  ____________________33__
32969 +                  ___________44___________
32970 +                  __________33333_________
32971 +                  ______________22________
32972 +                  ___________________2222_
32973 +                  _________111111111______
32974 +                  _____________________11_
32975 +                  _________________4______
32976 +
32977 +               Sanitized equivalent (no overlap):
32978 +                  1_______________________
32979 +                  _44_____________________
32980 +                  ___1____________________
32981 +                  ____22__________________
32982 +                  ______11________________
32983 +                  _________1______________
32984 +                  __________3_____________
32985 +                  ___________44___________
32986 +                  _____________33_________
32987 +                  _______________2________
32988 +                  ________________1_______
32989 +                  _________________4______
32990 +                  ___________________2____
32991 +                  ____________________33__
32992 +                  ______________________4_
32993 +       */
32994 +
32995 +       /* if there's only one memory region, don't bother */
32996 +       if (*pnr_map < 2)
32997 +               return -1;
32998 +
32999 +       old_nr = *pnr_map;
33000 +
33001 +       /* bail out if we find any unreasonable addresses in bios map */
33002 +       for (i=0; i<old_nr; i++)
33003 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
33004 +                       return -1;
33005 +
33006 +       /* create pointers for initial change-point information (for sorting) */
33007 +       for (i=0; i < 2*old_nr; i++)
33008 +               change_point[i] = &change_point_list[i];
33009 +
33010 +       /* record all known change-points (starting and ending addresses),
33011 +          omitting those that are for empty memory regions */
33012 +       chgidx = 0;
33013 +       for (i=0; i < old_nr; i++)      {
33014 +               if (biosmap[i].size != 0) {
33015 +                       change_point[chgidx]->addr = biosmap[i].addr;
33016 +                       change_point[chgidx++]->pbios = &biosmap[i];
33017 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
33018 +                       change_point[chgidx++]->pbios = &biosmap[i];
33019 +               }
33020 +       }
33021 +       chg_nr = chgidx;
33022 +
33023 +       /* sort change-point list by memory addresses (low -> high) */
33024 +       still_changing = 1;
33025 +       while (still_changing)  {
33026 +               still_changing = 0;
33027 +               for (i=1; i < chg_nr; i++)  {
33028 +                       /* if <current_addr> > <last_addr>, swap */
33029 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
33030 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
33031 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
33032 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
33033 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
33034 +                          )
33035 +                       {
33036 +                               change_tmp = change_point[i];
33037 +                               change_point[i] = change_point[i-1];
33038 +                               change_point[i-1] = change_tmp;
33039 +                               still_changing=1;
33040 +                       }
33041 +               }
33042 +       }
33043 +
33044 +       /* create a new bios memory map, removing overlaps */
33045 +       overlap_entries=0;       /* number of entries in the overlap table */
33046 +       new_bios_entry=0;        /* index for creating new bios map entries */
33047 +       last_type = 0;           /* start with undefined memory type */
33048 +       last_addr = 0;           /* start with 0 as last starting address */
33049 +       /* loop through change-points, determining effect on the new bios map */
33050 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
33051 +       {
33052 +               /* keep track of all overlapping bios entries */
33053 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
33054 +               {
33055 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
33056 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
33057 +               }
33058 +               else
33059 +               {
33060 +                       /* remove entry from list (order independent, so swap with last) */
33061 +                       for (i=0; i<overlap_entries; i++)
33062 +                       {
33063 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
33064 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
33065 +                       }
33066 +                       overlap_entries--;
33067 +               }
33068 +               /* if there are overlapping entries, decide which "type" to use */
33069 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
33070 +               current_type = 0;
33071 +               for (i=0; i<overlap_entries; i++)
33072 +                       if (overlap_list[i]->type > current_type)
33073 +                               current_type = overlap_list[i]->type;
33074 +               /* continue building up new bios map based on this information */
33075 +               if (current_type != last_type)  {
33076 +                       if (last_type != 0)      {
33077 +                               new_bios[new_bios_entry].size =
33078 +                                       change_point[chgidx]->addr - last_addr;
33079 +                               /* move forward only if the new size was non-zero */
33080 +                               if (new_bios[new_bios_entry].size != 0)
33081 +                                       if (++new_bios_entry >= E820MAX)
33082 +                                               break;  /* no more space left for new bios entries */
33083 +                       }
33084 +                       if (current_type != 0)  {
33085 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
33086 +                               new_bios[new_bios_entry].type = current_type;
33087 +                               last_addr=change_point[chgidx]->addr;
33088 +                       }
33089 +                       last_type = current_type;
33090 +               }
33091 +       }
33092 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
33093 +
33094 +       /* copy new bios mapping into original location */
33095 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
33096 +       *pnr_map = new_nr;
33097 +
33098 +       return 0;
33099 +}
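A small worked example of the overlap removal above, assuming the usual type precedence (a larger type value such as E820_RESERVED wins over E820_RAM); the addresses are made up:

/* Hypothetical input: two overlapping BIOS entries. */
struct e820entry before[2] = {
        { .addr = 0x0000, .size = 0x2000, .type = E820_RAM      },
        { .addr = 0x1000, .size = 0x2000, .type = E820_RESERVED },
};
/* Change points are generated at 0x0000, 0x1000, 0x2000 and 0x3000.  In the
 * overlap [0x1000, 0x2000) the larger type value wins, so the sanitized map
 * becomes:
 *   0x0000 - 0x1000  E820_RAM
 *   0x1000 - 0x3000  E820_RESERVED
 */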
33100 +
33101 +/*
33102 + * Copy the BIOS e820 map into a safe place.
33103 + *
33104 + * Sanity-check it while we're at it..
33105 + *
33106 + * If we're lucky and live on a modern system, the setup code
33107 + * will have given us a memory map that we can use to properly
33108 + * set up memory.  If we aren't, we'll fake a memory map.
33109 + */
33110 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
33111 +{
33112 +#ifndef CONFIG_XEN
33113 +       /* Only one memory region (or negative)? Ignore it */
33114 +       if (nr_map < 2)
33115 +               return -1;
33116 +#else
33117 +       BUG_ON(nr_map < 1);
33118 +#endif
33119 +
33120 +       do {
33121 +               unsigned long start = biosmap->addr;
33122 +               unsigned long size = biosmap->size;
33123 +               unsigned long end = start + size;
33124 +               unsigned long type = biosmap->type;
33125 +
33126 +               /* Overflow in 64 bits? Ignore the memory map. */
33127 +               if (start > end)
33128 +                       return -1;
33129 +
33130 +               add_memory_region(start, size, type);
33131 +       } while (biosmap++,--nr_map);
33132 +       return 0;
33133 +}
33134 +
33135 +void early_panic(char *msg)
33136 +{
33137 +       early_printk(msg);
33138 +       panic(msg);
33139 +}
33140 +
33141 +void __init setup_memory_region(void)
33142 +{
33143 +#ifndef CONFIG_XEN
33144 +       /*
33145 +        * Try to copy the BIOS-supplied E820-map.
33146 +        *
33147 +        * Otherwise fake a memory map; one section from 0k->640k,
33148 +        * the next section from 1mb->appropriate_mem_k
33149 +        */
33150 +       sanitize_e820_map(E820_MAP, &E820_MAP_NR);
33151 +       if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
33152 +               early_panic("Cannot find a valid memory map");
33153 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
33154 +       e820_print_map("BIOS-e820");
33155 +#else  /* CONFIG_XEN */
33156 +       int rc;
33157 +       struct xen_memory_map memmap;
33158 +       /*
33159 +        * This is rather large for a stack variable but this early in
33160 +        * the boot process we know we have plenty of slack space.
33161 +        */
33162 +       struct e820entry map[E820MAX];
33163 +
33164 +       memmap.nr_entries = E820MAX;
33165 +       set_xen_guest_handle(memmap.buffer, map);
33166 +
33167 +       rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
33168 +       if ( rc == -ENOSYS ) {
33169 +               memmap.nr_entries = 1;
33170 +               map[0].addr = 0ULL;
33171 +               map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
33172 +               /* 8MB slack (to balance backend allocations). */
33173 +               map[0].size += 8 << 20;
33174 +               map[0].type = E820_RAM;
33175 +               rc = 0;
33176 +       }
33177 +       BUG_ON(rc);
33178 +
33179 +       sanitize_e820_map(map, (char *)&memmap.nr_entries);
33180 +       if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
33181 +               early_panic("Cannot find a valid memory map");
33182 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
33183 +       e820_print_map("Xen");
33184 +#endif
33185 +}
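For the -ENOSYS fallback in the Xen branch above, the arithmetic works out as follows for a hypothetical guest of 65536 pages (4 KiB each; the figure is invented):

/* Hypothetical fallback map when XENMEM_memory_map is unimplemented. */
unsigned long nr_pages = 65536;                         /* xen_start_info->nr_pages */
unsigned long long size = (unsigned long long)nr_pages << PAGE_SHIFT;  /* 256 MiB */
size += 8 << 20;                                        /* + 8 MiB backend slack   */
/* map[0] = { .addr = 0, .size = 0x10800000ULL, .type = E820_RAM } and
 * memmap.nr_entries = 1, exactly as in the code above. */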
33186 +
33187 +static int __init parse_memopt(char *p)
33188 +{ 
33189 +       int i;
33190 +       unsigned long current_end;
33191 +       unsigned long end;
33192 +
33193 +       if (!p)
33194 +               return -EINVAL;
33195 +       end_user_pfn = memparse(p, &p);
33196 +       end_user_pfn >>= PAGE_SHIFT;    
33197 +
33198 +       end = end_user_pfn<<PAGE_SHIFT;
33199 +       i = e820.nr_map-1;
33200 +       current_end = e820.map[i].addr + e820.map[i].size;
33201 +
33202 +       if (current_end < end) {
33203 +               /*
33204 +                 * The e820 map ends before our requested size so
33205 +                 * extend the final entry to the requested address.
33206 +                 */
33207 +               if (e820.map[i].type == E820_RAM)
33208 +                       e820.map[i].size = end - e820.map[i].addr;
33209 +               else
33210 +                       add_memory_region(current_end, end - current_end, E820_RAM);
33211 +       }
33212 +       return 0;
33213 +} 
33214 +early_param("mem", parse_memopt);
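A hypothetical "mem=512M" walk-through of the handler above (4 KiB pages assumed):

/* memparse("512M", ...) returns 0x20000000. */
end_user_pfn = 0x20000000UL >> PAGE_SHIFT;   /* 0x20000 pages */
/* If the last e820 entry ends below 512 MiB and is E820_RAM, it is grown up
 * to that address; if it is not RAM, a new E820_RAM region is appended to
 * cover the difference, as in the branch above. */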
33215 +
33216 +static int userdef __initdata;
33217 +
33218 +static int __init parse_memmap_opt(char *p)
33219 +{
33220 +       char *oldp;
33221 +       unsigned long long start_at, mem_size;
33222 +
33223 +       if (!strcmp(p, "exactmap")) {
33224 +#ifdef CONFIG_CRASH_DUMP
33225 +               /* If we are doing a crash dump, we
33226 +                * still need to know the real mem
33227 +                * size before original memory map is
33228 +                * reset.
33229 +                */
33230 +               e820_register_active_regions(0, 0, -1UL);
33231 +               saved_max_pfn = e820_end_of_ram();
33232 +               remove_all_active_ranges();
33233 +#endif
33234 +               end_pfn_map = 0;
33235 +               e820.nr_map = 0;
33236 +               userdef = 1;
33237 +               return 0;
33238 +       }
33239 +
33240 +       oldp = p;
33241 +       mem_size = memparse(p, &p);
33242 +       if (p == oldp)
33243 +               return -EINVAL;
33244 +       if (*p == '@') {
33245 +               start_at = memparse(p+1, &p);
33246 +               add_memory_region(start_at, mem_size, E820_RAM);
33247 +       } else if (*p == '#') {
33248 +               start_at = memparse(p+1, &p);
33249 +               add_memory_region(start_at, mem_size, E820_ACPI);
33250 +       } else if (*p == '$') {
33251 +               start_at = memparse(p+1, &p);
33252 +               add_memory_region(start_at, mem_size, E820_RESERVED);
33253 +       } else {
33254 +               end_user_pfn = (mem_size >> PAGE_SHIFT);
33255 +       }
33256 +       return *p == '\0' ? 0 : -EINVAL;
33257 +}
33258 +early_param("memmap", parse_memmap_opt);
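The memmap= forms handled above, shown with hypothetical values:

/* Hypothetical command-line uses of parse_memmap_opt():
 *   memmap=exactmap        - clear the BIOS map and rebuild it from scratch
 *   memmap=64M@16M         - add_memory_region(16 MiB, 64 MiB, E820_RAM)
 *   memmap=4M#0x7ff00000   - add an E820_ACPI region
 *   memmap=16M$0xe0000000  - add an E820_RESERVED region
 *   memmap=512M            - only limit end_user_pfn, like "mem=512M"
 */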
33259 +
33260 +void finish_e820_parsing(void)
33261 +{
33262 +       if (userdef) {
33263 +               printk(KERN_INFO "user-defined physical RAM map:\n");
33264 +               e820_print_map("user");
33265 +       }
33266 +}
33267 +
33268 +unsigned long pci_mem_start = 0xaeedbabe;
33269 +EXPORT_SYMBOL(pci_mem_start);
33270 +
33271 +/*
33272 + * Search for the biggest gap in the low 32 bits of the e820
33273 + * memory space.  We pass this space to PCI to assign MMIO resources
33274 + * for hotplug or unconfigured devices in.
33275 + * Hopefully the BIOS left enough space.
33276 + */
33277 +__init void e820_setup_gap(struct e820entry *e820, int nr_map)
33278 +{
33279 +       unsigned long gapstart, gapsize, round;
33280 +       unsigned long last;
33281 +       int i;
33282 +       int found = 0;
33283 +
33284 +       last = 0x100000000ull;
33285 +       gapstart = 0x10000000;
33286 +       gapsize = 0x400000;
33287 +       i = nr_map;
33288 +       while (--i >= 0) {
33289 +               unsigned long long start = e820[i].addr;
33290 +               unsigned long long end = start + e820[i].size;
33291 +
33292 +               /*
33293 +                * Since "last" is at most 4GB, we know we'll
33294 +                * fit in 32 bits if this condition is true
33295 +                */
33296 +               if (last > end) {
33297 +                       unsigned long gap = last - end;
33298 +
33299 +                       if (gap > gapsize) {
33300 +                               gapsize = gap;
33301 +                               gapstart = end;
33302 +                               found = 1;
33303 +                       }
33304 +               }
33305 +               if (start < last)
33306 +                       last = start;
33307 +       }
33308 +
33309 +       if (!found) {
33310 +               gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
33311 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
33312 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
33313 +       }
33314 +
33315 +       /*
33316 +        * See how much we want to round up: start off with
33317 +        * rounding to the next 1MB area.
33318 +        */
33319 +       round = 0x100000;
33320 +       while ((gapsize >> 4) > round)
33321 +               round += round;
33322 +       /* Fun with two's complement */
33323 +       pci_mem_start = (gapstart + round) & -round;
33324 +
33325 +       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
33326 +               pci_mem_start, gapstart, gapsize);
33327 +}
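The rounding at the end of e820_setup_gap() uses the two's-complement identity (x + round) & -round to round up to a multiple of round. A worked example with invented numbers:

/* Hypothetical gap: starts at 0xdff00000 and is 0x20000000 (512 MiB) long. */
unsigned long gapstart = 0xdff00000UL, gapsize = 0x20000000UL;
unsigned long round = 0x100000;                 /* start at 1 MiB          */
while ((gapsize >> 4) > round)
        round += round;                         /* doubles up to 0x2000000 */
unsigned long pci_mem_start = (gapstart + round) & -round;  /* 0xe0000000 */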
33328 diff -ruNp linux-2.6.19/arch/x86_64/kernel/early_printk-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/early_printk-xen.c
33329 --- linux-2.6.19/arch/x86_64/kernel/early_printk-xen.c  1970-01-01 00:00:00.000000000 +0000
33330 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/early_printk-xen.c        2007-02-02 19:10:26.000000000 +0000
33331 @@ -0,0 +1,299 @@
33332 +#include <linux/console.h>
33333 +#include <linux/kernel.h>
33334 +#include <linux/init.h>
33335 +#include <linux/string.h>
33336 +#include <linux/screen_info.h>
33337 +#include <asm/io.h>
33338 +#include <asm/processor.h>
33339 +#include <asm/fcntl.h>
33340 +
33341 +/* Simple VGA output */
33342 +
33343 +#ifdef __i386__
33344 +#include <asm/setup.h>
33345 +#define VGABASE                (__ISA_IO_base + 0xb8000)
33346 +#else
33347 +#include <asm/bootsetup.h>
33348 +#define VGABASE                ((void __iomem *)0xffffffff800b8000UL)
33349 +#endif
33350 +
33351 +static int max_ypos = 25, max_xpos = 80;
33352 +static int current_ypos = 25, current_xpos = 0;
33353 +
33354 +#ifndef CONFIG_XEN
33355 +static void early_vga_write(struct console *con, const char *str, unsigned n)
33356 +{
33357 +       char c;
33358 +       int  i, k, j;
33359 +
33360 +       while ((c = *str++) != '\0' && n-- > 0) {
33361 +               if (current_ypos >= max_ypos) {
33362 +                       /* scroll 1 line up */
33363 +                       for (k = 1, j = 0; k < max_ypos; k++, j++) {
33364 +                               for (i = 0; i < max_xpos; i++) {
33365 +                                       writew(readw(VGABASE+2*(max_xpos*k+i)),
33366 +                                              VGABASE + 2*(max_xpos*j + i));
33367 +                               }
33368 +                       }
33369 +                       for (i = 0; i < max_xpos; i++)
33370 +                               writew(0x720, VGABASE + 2*(max_xpos*j + i));
33371 +                       current_ypos = max_ypos-1;
33372 +               }
33373 +               if (c == '\n') {
33374 +                       current_xpos = 0;
33375 +                       current_ypos++;
33376 +               } else if (c != '\r')  {
33377 +                       writew(((0x7 << 8) | (unsigned short) c),
33378 +                              VGABASE + 2*(max_xpos*current_ypos +
33379 +                                               current_xpos++));
33380 +                       if (current_xpos >= max_xpos) {
33381 +                               current_xpos = 0;
33382 +                               current_ypos++;
33383 +                       }
33384 +               }
33385 +       }
33386 +}
33387 +
33388 +static struct console early_vga_console = {
33389 +       .name =         "earlyvga",
33390 +       .write =        early_vga_write,
33391 +       .flags =        CON_PRINTBUFFER,
33392 +       .index =        -1,
33393 +};
33394 +
33395 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
33396 +
33397 +static int early_serial_base = 0x3f8;  /* ttyS0 */
33398 +
33399 +#define XMTRDY          0x20
33400 +
33401 +#define DLAB           0x80
33402 +
33403 +#define TXR             0       /*  Transmit register (WRITE) */
33404 +#define RXR             0       /*  Receive register  (READ)  */
33405 +#define IER             1       /*  Interrupt Enable          */
33406 +#define IIR             2       /*  Interrupt ID              */
33407 +#define FCR             2       /*  FIFO control              */
33408 +#define LCR             3       /*  Line control              */
33409 +#define MCR             4       /*  Modem control             */
33410 +#define LSR             5       /*  Line Status               */
33411 +#define MSR             6       /*  Modem Status              */
33412 +#define DLL             0       /*  Divisor Latch Low         */
33413 +#define DLH             1       /*  Divisor latch High        */
33414 +
33415 +static int early_serial_putc(unsigned char ch)
33416 +{
33417 +       unsigned timeout = 0xffff;
33418 +       while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
33419 +               cpu_relax();
33420 +       outb(ch, early_serial_base + TXR);
33421 +       return timeout ? 0 : -1;
33422 +}
33423 +
33424 +static void early_serial_write(struct console *con, const char *s, unsigned n)
33425 +{
33426 +       while (*s && n-- > 0) {
33427 +               early_serial_putc(*s);
33428 +               if (*s == '\n')
33429 +                       early_serial_putc('\r');
33430 +               s++;
33431 +       }
33432 +}
33433 +
33434 +#define DEFAULT_BAUD 9600
33435 +
33436 +static __init void early_serial_init(char *s)
33437 +{
33438 +       unsigned char c;
33439 +       unsigned divisor;
33440 +       unsigned baud = DEFAULT_BAUD;
33441 +       char *e;
33442 +
33443 +       if (*s == ',')
33444 +               ++s;
33445 +
33446 +       if (*s) {
33447 +               unsigned port;
33448 +               if (!strncmp(s,"0x",2)) {
33449 +                       early_serial_base = simple_strtoul(s, &e, 16);
33450 +               } else {
33451 +                       static int bases[] = { 0x3f8, 0x2f8 };
33452 +
33453 +                       if (!strncmp(s,"ttyS",4))
33454 +                               s += 4;
33455 +                       port = simple_strtoul(s, &e, 10);
33456 +                       if (port > 1 || s == e)
33457 +                               port = 0;
33458 +                       early_serial_base = bases[port];
33459 +               }
33460 +               s += strcspn(s, ",");
33461 +               if (*s == ',')
33462 +                       s++;
33463 +       }
33464 +
33465 +       outb(0x3, early_serial_base + LCR);     /* 8n1 */
33466 +       outb(0, early_serial_base + IER);       /* no interrupt */
33467 +       outb(0, early_serial_base + FCR);       /* no fifo */
33468 +       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
33469 +
33470 +       if (*s) {
33471 +               baud = simple_strtoul(s, &e, 0);
33472 +               if (baud == 0 || s == e)
33473 +                       baud = DEFAULT_BAUD;
33474 +       }
33475 +
33476 +       divisor = 115200 / baud;
33477 +       c = inb(early_serial_base + LCR);
33478 +       outb(c | DLAB, early_serial_base + LCR);
33479 +       outb(divisor & 0xff, early_serial_base + DLL);
33480 +       outb((divisor >> 8) & 0xff, early_serial_base + DLH);
33481 +       outb(c & ~DLAB, early_serial_base + LCR);
33482 +}
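The divisor programmed above comes from the standard 115200-baud UART clock; for example (values hypothetical, DEFAULT_BAUD assumed):

/* Hypothetical values for the divisor computation above. */
unsigned baud = 9600;                /* e.g. earlyprintk=serial,ttyS0,9600 */
unsigned divisor = 115200 / baud;    /* 12 -> DLL = 12, DLH = 0            */
/* At 115200 baud the divisor is 1; at 57600 it is 2. */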
33483 +
33484 +#else /* CONFIG_XEN */
33485 +
33486 +#undef SCREEN_INFO
33487 +#define SCREEN_INFO screen_info
33488 +extern struct screen_info screen_info;
33489 +
33490 +static void
33491 +early_serial_write(struct console *con, const char *s, unsigned count)
33492 +{
33493 +       int n;
33494 +
33495 +       while (count > 0) {
33496 +               n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
33497 +               if (n <= 0)
33498 +                       break;
33499 +               count -= n;
33500 +               s += n;
33501 +       }
33502 +} 
33503 +
33504 +static __init void early_serial_init(char *s)
33505 +{
33506 +       current_xpos = 0;
33507 +}
33508 +
33509 +/*
33510 + * No early VGA console on Xen, as we do not have convenient ISA-space
33511 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
33512 + */
33513 +#define early_vga_console early_serial_console
33514 +
33515 +#endif
33516 +
33517 +static struct console early_serial_console = {
33518 +       .name =         "earlyser",
33519 +       .write =        early_serial_write,
33520 +       .flags =        CON_PRINTBUFFER,
33521 +       .index =        -1,
33522 +};
33523 +
33524 +/* Console interface to a host file on AMD's SimNow! */
33525 +
33526 +static int simnow_fd;
33527 +
33528 +enum {
33529 +       MAGIC1 = 0xBACCD00A,
33530 +       MAGIC2 = 0xCA110000,
33531 +       XOPEN = 5,
33532 +       XWRITE = 4,
33533 +};
33534 +
33535 +static noinline long simnow(long cmd, long a, long b, long c)
33536 +{
33537 +       long ret;
33538 +       asm volatile("cpuid" :
33539 +                    "=a" (ret) :
33540 +                    "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
33541 +       return ret;
33542 +}
33543 +
33544 +void __init simnow_init(char *str)
33545 +{
33546 +       char *fn = "klog";
33547 +       if (*str == '=')
33548 +               fn = ++str;
33549 +       /* error ignored */
33550 +       simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
33551 +}
33552 +
33553 +static void simnow_write(struct console *con, const char *s, unsigned n)
33554 +{
33555 +       simnow(XWRITE, simnow_fd, (unsigned long)s, n);
33556 +}
33557 +
33558 +static struct console simnow_console = {
33559 +       .name =         "simnow",
33560 +       .write =        simnow_write,
33561 +       .flags =        CON_PRINTBUFFER,
33562 +       .index =        -1,
33563 +};
33564 +
33565 +/* Direct interface for emergencies */
33566 +struct console *early_console = &early_vga_console;
33567 +static int early_console_initialized = 0;
33568 +
33569 +void early_printk(const char *fmt, ...)
33570 +{
33571 +       char buf[512];
33572 +       int n;
33573 +       va_list ap;
33574 +
33575 +       va_start(ap,fmt);
33576 +       n = vscnprintf(buf,512,fmt,ap);
33577 +       early_console->write(early_console,buf,n);
33578 +       va_end(ap);
33579 +}
33580 +
33581 +static int __initdata keep_early;
33582 +
33583 +static int __init setup_early_printk(char *buf)
33584 +{
33585 +       if (!buf)
33586 +               return 0;
33587 +
33588 +       if (early_console_initialized)
33589 +               return 0;
33590 +       early_console_initialized = 1;
33591 +
33592 +       if (strstr(buf, "keep"))
33593 +               keep_early = 1;
33594 +
33595 +       if (!strncmp(buf, "serial", 6)) {
33596 +               early_serial_init(buf + 6);
33597 +               early_console = &early_serial_console;
33598 +       } else if (!strncmp(buf, "ttyS", 4)) {
33599 +               early_serial_init(buf);
33600 +               early_console = &early_serial_console;
33601 +       } else if (!strncmp(buf, "vga", 3)
33602 +                  && SCREEN_INFO.orig_video_isVGA == 1) {
33603 +               max_xpos = SCREEN_INFO.orig_video_cols;
33604 +               max_ypos = SCREEN_INFO.orig_video_lines;
33605 +               current_ypos = SCREEN_INFO.orig_y;
33606 +               early_console = &early_vga_console;
33607 +       } else if (!strncmp(buf, "simnow", 6)) {
33608 +               simnow_init(buf + 6);
33609 +               early_console = &simnow_console;
33610 +               keep_early = 1;
33611 +       }
33612 +       register_console(early_console);
33613 +       return 0;
33614 +}
33615 +
33616 +early_param("earlyprintk", setup_early_printk);
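For reference, hypothetical command lines accepted by setup_early_printk() above:

/*   earlyprintk=serial,ttyS1,57600 - early UART at 0x2f8, 57600 baud
 *   earlyprintk=ttyS0              - early UART at 0x3f8, default 9600 baud
 *   earlyprintk=vga                - early VGA text console (aliased to the
 *                                    fake serial console under CONFIG_XEN)
 *   earlyprintk=simnow=klog        - AMD SimNow! host-file console
 *   earlyprintk=serial,keep        - keep the console registered after
 *                                    disable_early_printk()
 */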
33617 +
33618 +void __init disable_early_printk(void)
33619 +{
33620 +       if (!early_console_initialized || !early_console)
33621 +               return;
33622 +       if (!keep_early) {
33623 +               printk("disabling early console\n");
33624 +               unregister_console(early_console);
33625 +               early_console_initialized = 0;
33626 +       } else {
33627 +               printk("keeping early console\n");
33628 +       }
33629 +}
33630 +
33631 diff -ruNp linux-2.6.19/arch/x86_64/kernel/entry-xen.S linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/entry-xen.S
33632 --- linux-2.6.19/arch/x86_64/kernel/entry-xen.S 1970-01-01 00:00:00.000000000 +0000
33633 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/entry-xen.S       2007-02-02 19:10:26.000000000 +0000
33634 @@ -0,0 +1,1217 @@
33635 +/*
33636 + *  linux/arch/x86_64/entry.S
33637 + *
33638 + *  Copyright (C) 1991, 1992  Linus Torvalds
33639 + *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
33640 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
33641 + * 
33642 + *  Jun Nakajima <jun.nakajima@intel.com>
33643 + *  Asit Mallick <asit.k.mallick@intel.com>
33644 + *      Modified for Xen
33645 + */
33646 +
33647 +/*
33648 + * entry.S contains the system-call and fault low-level handling routines.
33649 + *
33650 + * NOTE: This code handles signal-recognition, which happens every time
33651 + * after an interrupt and after each system call.
33652 + * 
33653 + * Normal syscalls and interrupts don't save a full stack frame; this is
33654 + * only done for syscall tracing, signals, or fork/exec et al.
33655 + * 
33656 + * A note on terminology:       
33657 + * - top of stack: Architecture defined interrupt frame from SS to RIP 
33658 + * at the top of the kernel process stack.     
33659 + * - partial stack frame: partially saved registers up to R11.
33660 + * - full stack frame: Like partial stack frame, but all registers saved.
33661 + *
33662 + * Some macro usage:
33663 + * - CFI macros are used to generate dwarf2 unwind information for better
33664 + * backtraces. They don't change any code.
33665 + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
33666 + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
33667 + * There are unfortunately lots of special cases where some registers are
33668 + * not touched. The macro is a big mess that should be cleaned up.
33669 + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
33670 + * Gives a full stack frame.
33671 + * - ENTRY/END Define functions in the symbol table.
33672 + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
33673 + * frame that is otherwise undefined after a SYSCALL
33674 + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
33675 + * - errorentry/paranoidentry/zeroentry - Define exception entry points.
33676 + */
33677 +
33678 +#include <linux/linkage.h>
33679 +#include <asm/segment.h>
33680 +#include <asm/cache.h>
33681 +#include <asm/errno.h>
33682 +#include <asm/dwarf2.h>
33683 +#include <asm/calling.h>
33684 +#include <asm/asm-offsets.h>
33685 +#include <asm/msr.h>
33686 +#include <asm/unistd.h>
33687 +#include <asm/thread_info.h>
33688 +#include <asm/hw_irq.h>
33689 +#include <asm/page.h>
33690 +#include <asm/irqflags.h>
33691 +#include <asm/errno.h>
33692 +#include <xen/interface/arch-x86_64.h>
33693 +#include <xen/interface/features.h>
33694 +
33695 +#include "irq_vectors.h"
33696 +
33697 +#include "xen_entry.S"
33698 +
33699 +       .code64
33700 +
33701 +#ifndef CONFIG_PREEMPT
33702 +#define retint_kernel retint_restore_args
33703 +#endif 
33704 +
33705 +
33706 +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
33707 +#ifdef CONFIG_TRACE_IRQFLAGS
33708 +       bt   $9,EFLAGS-\offset(%rsp)    /* interrupts off? */
33709 +       jnc  1f
33710 +       TRACE_IRQS_ON
33711 +1:
33712 +#endif
33713 +.endm
33714 +
33715 +NMI_MASK = 0x80000000
33716 +
33717 +/*
33718 + * C code is not supposed to know about undefined top of stack. Every time 
33719 + * a C function with a pt_regs argument is called from the SYSCALL-based
33720 + * fast path, FIXUP_TOP_OF_STACK is needed.
33721 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
33722 + * manipulation.
33723 + */            
33724 +               
33725 +       /* %rsp:at FRAMEEND */ 
33726 +       .macro FIXUP_TOP_OF_STACK tmp
33727 +       movq    $__USER_CS,CS(%rsp)
33728 +       movq    $-1,RCX(%rsp)
33729 +       .endm
33730 +
33731 +       .macro RESTORE_TOP_OF_STACK tmp,offset=0
33732 +       .endm
33733 +
33734 +       .macro FAKE_STACK_FRAME child_rip
33735 +       /* push in order ss, rsp, eflags, cs, rip */
33736 +       xorl %eax, %eax
33737 +       pushq %rax /* ss */
33738 +       CFI_ADJUST_CFA_OFFSET   8
33739 +       /*CFI_REL_OFFSET        ss,0*/
33740 +       pushq %rax /* rsp */
33741 +       CFI_ADJUST_CFA_OFFSET   8
33742 +       CFI_REL_OFFSET  rsp,0
33743 +       pushq $(1<<9) /* eflags - interrupts on */
33744 +       CFI_ADJUST_CFA_OFFSET   8
33745 +       /*CFI_REL_OFFSET        rflags,0*/
33746 +       pushq $__KERNEL_CS /* cs */
33747 +       CFI_ADJUST_CFA_OFFSET   8
33748 +       /*CFI_REL_OFFSET        cs,0*/
33749 +       pushq \child_rip /* rip */
33750 +       CFI_ADJUST_CFA_OFFSET   8
33751 +       CFI_REL_OFFSET  rip,0
33752 +       pushq   %rax /* orig rax */
33753 +       CFI_ADJUST_CFA_OFFSET   8
33754 +       .endm
33755 +
33756 +       .macro UNFAKE_STACK_FRAME
33757 +       addq $8*6, %rsp
33758 +       CFI_ADJUST_CFA_OFFSET   -(6*8)
33759 +       .endm
33760 +
33761 +       .macro  CFI_DEFAULT_STACK start=1
33762 +       .if \start
33763 +       CFI_STARTPROC   simple
33764 +       CFI_SIGNAL_FRAME
33765 +       CFI_DEF_CFA     rsp,SS+8
33766 +       .else
33767 +       CFI_DEF_CFA_OFFSET SS+8
33768 +       .endif
33769 +       CFI_REL_OFFSET  r15,R15
33770 +       CFI_REL_OFFSET  r14,R14
33771 +       CFI_REL_OFFSET  r13,R13
33772 +       CFI_REL_OFFSET  r12,R12
33773 +       CFI_REL_OFFSET  rbp,RBP
33774 +       CFI_REL_OFFSET  rbx,RBX
33775 +       CFI_REL_OFFSET  r11,R11
33776 +       CFI_REL_OFFSET  r10,R10
33777 +       CFI_REL_OFFSET  r9,R9
33778 +       CFI_REL_OFFSET  r8,R8
33779 +       CFI_REL_OFFSET  rax,RAX
33780 +       CFI_REL_OFFSET  rcx,RCX
33781 +       CFI_REL_OFFSET  rdx,RDX
33782 +       CFI_REL_OFFSET  rsi,RSI
33783 +       CFI_REL_OFFSET  rdi,RDI
33784 +       CFI_REL_OFFSET  rip,RIP
33785 +       /*CFI_REL_OFFSET        cs,CS*/
33786 +       /*CFI_REL_OFFSET        rflags,EFLAGS*/
33787 +       CFI_REL_OFFSET  rsp,RSP
33788 +       /*CFI_REL_OFFSET        ss,SS*/
33789 +       .endm
33790 +
33791 +       /*
33792 +        * Must be consistent with the definition in arch-x86_64.h:    
33793 +        *     struct iret_context {
33794 +        *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
33795 +        *     };
33796 +        * #define VGCF_IN_SYSCALL (1<<8) 
33797 +        */
33798 +       .macro HYPERVISOR_IRET flag
33799 +       testb $3,1*8(%rsp)
33800 +       jnz   2f
33801 +       testl $NMI_MASK,2*8(%rsp)
33802 +       jnz   2f
33803 +
33804 +       testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
33805 +       jnz   1f
33806 +
33807 +       /* Direct iret to kernel space. Correct CS and SS. */
33808 +       orb   $3,1*8(%rsp)
33809 +       orb   $3,4*8(%rsp)
33810 +1:     iretq
33811 +
33812 +2:     /* Slow iret via hypervisor. */
33813 +       andl  $~NMI_MASK, 16(%rsp)
33814 +       pushq $\flag
33815 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
33816 +       .endm
33817 +
33818 +       .macro SWITCH_TO_KERNEL ssoff,adjust=0
33819 +       jc  1f
33820 +       orb  $1,\ssoff-\adjust+4(%rsp)
33821 +1:
33822 +       .endm
33823 +
33824 +/*
33825 + * A newly forked process directly context switches into this.
33826 + */    
33827 +/* rdi:        prev */ 
33828 +ENTRY(ret_from_fork)
33829 +       CFI_DEFAULT_STACK
33830 +       push kernel_eflags(%rip)
33831 +       CFI_ADJUST_CFA_OFFSET 4
33832 +       popf                            # reset kernel eflags
33833 +       CFI_ADJUST_CFA_OFFSET -4
33834 +       call schedule_tail
33835 +       GET_THREAD_INFO(%rcx)
33836 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
33837 +       jnz rff_trace
33838 +rff_action:    
33839 +       RESTORE_REST
33840 +       testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
33841 +       je   int_ret_from_sys_call
33842 +       testl $_TIF_IA32,threadinfo_flags(%rcx)
33843 +       jnz  int_ret_from_sys_call
33844 +       RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
33845 +       jmp ret_from_sys_call
33846 +rff_trace:
33847 +       movq %rsp,%rdi
33848 +       call syscall_trace_leave
33849 +       GET_THREAD_INFO(%rcx)   
33850 +       jmp rff_action
33851 +       CFI_ENDPROC
33852 +END(ret_from_fork)
33853 +
33854 +/*
33855 + * System call entry. Up to 6 arguments in registers are supported.
33856 + *
33857 + * SYSCALL does not save anything on the stack and does not change the
33858 + * stack pointer.
33859 + */
33860 +               
33861 +/*
33862 + * Register setup:     
33863 + * rax  system call number
33864 + * rdi  arg0
33865 + * rcx  return address for syscall/sysret, C arg3 
33866 + * rsi  arg1
33867 + * rdx  arg2   
33868 + * r10  arg3   (--> moved to rcx for C)
33869 + * r8   arg4
33870 + * r9   arg5
33871 + * r11  eflags for syscall/sysret, temporary for C
33872 + * r12-r15,rbp,rbx saved by C code, not touched.               
33873 + * 
33874 + * Interrupts are off on entry.
33875 + * Only called from user space.
33876 + *
33877 + * XXX if we had a free scratch register we could save the RSP into the stack frame
33878 + *      and report it properly in ps. Unfortunately we don't have one.
33879 + *
33880 + * When the user can change the frame, always force IRET. That is because
33881 + * it deals with non-canonical addresses better. SYSRET has trouble
33882 + * with them due to bugs in both AMD and Intel CPUs.
33883 + */                                    
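As a sketch (not part of the patch), the register setup above mapped onto a hypothetical four-argument system call:

/* Hypothetical handler: long sys_foo(int a, void *b, size_t c, long d).
 * On entry: rax = __NR_foo, rdi = a, rsi = b, rdx = c, r10 = d.
 * The "movq %r10,%rcx" before the indirect call below moves the fourth
 * argument into rcx, where the C calling convention expects it (rcx itself
 * holds the user return address at syscall entry and cannot carry an
 * argument). */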
33884 +
33885 +ENTRY(system_call)
33886 +       CFI_STARTPROC   simple
33887 +       CFI_SIGNAL_FRAME
33888 +       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
33889 +       CFI_REGISTER    rip,rcx
33890 +       /*CFI_REGISTER  rflags,r11*/
33891 +       SAVE_ARGS -8,0
33892 +       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
33893 +       /*
33894 +        * No need to follow this irqs off/on section - it's straight
33895 +        * and short:
33896 +        */
33897 +       XEN_UNBLOCK_EVENTS(%r11)
33898 +       GET_THREAD_INFO(%rcx)
33899 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
33900 +       CFI_REMEMBER_STATE
33901 +       jnz tracesys
33902 +       cmpq $__NR_syscall_max,%rax
33903 +       ja badsys
33904 +       movq %r10,%rcx
33905 +       call *sys_call_table(,%rax,8)  # XXX:    rip relative
33906 +       movq %rax,RAX-ARGOFFSET(%rsp)
33907 +/*
33908 + * Syscall return path ending with SYSRET (fast path)
33909 + * Has incomplete stack frame and undefined top of stack. 
33910 + */            
33911 +       .globl ret_from_sys_call
33912 +ret_from_sys_call:
33913 +       movl $_TIF_ALLWORK_MASK,%edi
33914 +       /* edi: flagmask */
33915 +sysret_check:          
33916 +       GET_THREAD_INFO(%rcx)
33917 +       XEN_BLOCK_EVENTS(%rsi)
33918 +       TRACE_IRQS_OFF
33919 +       movl threadinfo_flags(%rcx),%edx
33920 +       andl %edi,%edx
33921 +       CFI_REMEMBER_STATE
33922 +       jnz  sysret_careful 
33923 +       /*
33924 +        * sysretq will re-enable interrupts:
33925 +        */
33926 +       TRACE_IRQS_ON
33927 +       XEN_UNBLOCK_EVENTS(%rsi)
33928 +       CFI_REGISTER    rip,rcx
33929 +       RESTORE_ARGS 0,8,0
33930 +       /*CFI_REGISTER  rflags,r11*/
33931 +       HYPERVISOR_IRET VGCF_IN_SYSCALL
33932 +
33933 +       /* Handle reschedules */
33934 +       /* edx: work, edi: workmask */  
33935 +sysret_careful:
33936 +       CFI_RESTORE_STATE
33937 +       bt $TIF_NEED_RESCHED,%edx
33938 +       jnc sysret_signal
33939 +       TRACE_IRQS_ON
33940 +       XEN_UNBLOCK_EVENTS(%rsi)
33941 +       pushq %rdi
33942 +       CFI_ADJUST_CFA_OFFSET 8
33943 +       call schedule
33944 +       popq  %rdi
33945 +       CFI_ADJUST_CFA_OFFSET -8
33946 +       jmp sysret_check
33947 +
33948 +       /* Handle a signal */ 
33949 +sysret_signal:
33950 +       TRACE_IRQS_ON
33951 +       XEN_UNBLOCK_EVENTS(%rsi)
33952 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
33953 +       jz    1f
33954 +
33955 +       /* Really a signal */
33956 +       /* edx: work flags (arg3) */
33957 +       leaq do_notify_resume(%rip),%rax
33958 +       leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
33959 +       xorl %esi,%esi # oldset -> arg2
33960 +       call ptregscall_common
33961 +1:     movl $_TIF_NEED_RESCHED,%edi
33962 +       /* Use IRET because user could have changed frame. This
33963 +          works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
33964 +       cli
33965 +       TRACE_IRQS_OFF
33966 +       jmp int_with_check
33967 +       
33968 +badsys:
33969 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
33970 +       jmp ret_from_sys_call
33971 +
33972 +       /* Do syscall tracing */
33973 +tracesys:                       
33974 +       CFI_RESTORE_STATE
33975 +       SAVE_REST
33976 +       movq $-ENOSYS,RAX(%rsp)
33977 +       FIXUP_TOP_OF_STACK %rdi
33978 +       movq %rsp,%rdi
33979 +       call syscall_trace_enter
33980 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
33981 +       RESTORE_REST
33982 +       cmpq $__NR_syscall_max,%rax
33983 +       movq $-ENOSYS,%rcx
33984 +       cmova %rcx,%rax
33985 +       ja  1f
33986 +       movq %r10,%rcx  /* fixup for C */
33987 +       call *sys_call_table(,%rax,8)
33988 +1:     movq %rax,RAX-ARGOFFSET(%rsp)
33989 +       /* Use IRET because user could have changed frame */
33990 +       jmp int_ret_from_sys_call
33991 +       CFI_ENDPROC
33992 +END(system_call)
33993 +               
33994 +/* 
33995 + * Syscall return path ending with IRET.
33996 + * Has correct top of stack, but partial stack frame.
33997 + */    
33998 +ENTRY(int_ret_from_sys_call)
33999 +       CFI_STARTPROC   simple
34000 +       CFI_SIGNAL_FRAME
34001 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
34002 +       /*CFI_REL_OFFSET        ss,SS-ARGOFFSET*/
34003 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
34004 +       /*CFI_REL_OFFSET        rflags,EFLAGS-ARGOFFSET*/
34005 +       /*CFI_REL_OFFSET        cs,CS-ARGOFFSET*/
34006 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
34007 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
34008 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
34009 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
34010 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
34011 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
34012 +       CFI_REL_OFFSET  r8,R8-ARGOFFSET
34013 +       CFI_REL_OFFSET  r9,R9-ARGOFFSET
34014 +       CFI_REL_OFFSET  r10,R10-ARGOFFSET
34015 +       CFI_REL_OFFSET  r11,R11-ARGOFFSET
34016 +       XEN_BLOCK_EVENTS(%rsi)
34017 +       TRACE_IRQS_OFF
34018 +       testb $3,CS-ARGOFFSET(%rsp)
34019 +       jnz 1f
34020 +       /* Need to set the proper %ss (not NULL) for ring 3 iretq */
34021 +       movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
34022 +       jmp retint_restore_args   # return from ring3 kernel
34023 +1:
34024 +       movl $_TIF_ALLWORK_MASK,%edi
34025 +       /* edi: mask to check */
34026 +int_with_check:
34027 +       GET_THREAD_INFO(%rcx)
34028 +       movl threadinfo_flags(%rcx),%edx
34029 +       andl %edi,%edx
34030 +       jnz   int_careful
34031 +       andl    $~TS_COMPAT,threadinfo_status(%rcx)
34032 +       jmp   retint_restore_args
34033 +
34034 +       /* Either reschedule or signal or syscall exit tracking needed. */
34035 +       /* First do a reschedule test. */
34036 +       /* edx: work, edi: workmask */
34037 +int_careful:
34038 +       bt $TIF_NEED_RESCHED,%edx
34039 +       jnc  int_very_careful
34040 +       TRACE_IRQS_ON
34041 +       XEN_UNBLOCK_EVENTS(%rsi)
34042 +       pushq %rdi
34043 +       CFI_ADJUST_CFA_OFFSET 8
34044 +       call schedule
34045 +       popq %rdi
34046 +       CFI_ADJUST_CFA_OFFSET -8
34047 +       XEN_BLOCK_EVENTS(%rsi)
34048 +       TRACE_IRQS_OFF
34049 +       jmp int_with_check
34050 +
34051 +       /* handle signals and tracing -- both require a full stack frame */
34052 +int_very_careful:
34053 +       TRACE_IRQS_ON
34054 +       XEN_UNBLOCK_EVENTS(%rsi)
34055 +       SAVE_REST
34056 +       /* Check for syscall exit trace */      
34057 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
34058 +       jz int_signal
34059 +       pushq %rdi
34060 +       CFI_ADJUST_CFA_OFFSET 8
34061 +       leaq 8(%rsp),%rdi       # &ptregs -> arg1       
34062 +       call syscall_trace_leave
34063 +       popq %rdi
34064 +       CFI_ADJUST_CFA_OFFSET -8
34065 +       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
34066 +       XEN_BLOCK_EVENTS(%rsi)
34067 +       TRACE_IRQS_OFF
34068 +       jmp int_restore_rest
34069 +       
34070 +int_signal:
34071 +       testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
34072 +       jz 1f
34073 +       movq %rsp,%rdi          # &ptregs -> arg1
34074 +       xorl %esi,%esi          # oldset -> arg2
34075 +       call do_notify_resume
34076 +1:     movl $_TIF_NEED_RESCHED,%edi    
34077 +int_restore_rest:
34078 +       RESTORE_REST
34079 +       XEN_BLOCK_EVENTS(%rsi)
34080 +       TRACE_IRQS_OFF
34081 +       jmp int_with_check
34082 +       CFI_ENDPROC
34083 +END(int_ret_from_sys_call)
34084 +               
34085 +/* 
34086 + * Certain special system calls that need to save a complete full stack frame.
34087 + */                                                            
34088 +       
34089 +       .macro PTREGSCALL label,func,arg
34090 +       .globl \label
34091 +\label:
34092 +       leaq    \func(%rip),%rax
34093 +       leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
34094 +       jmp     ptregscall_common
34095 +END(\label)
34096 +       .endm
34097 +
34098 +       CFI_STARTPROC
34099 +
34100 +       PTREGSCALL stub_clone, sys_clone, %r8
34101 +       PTREGSCALL stub_fork, sys_fork, %rdi
34102 +       PTREGSCALL stub_vfork, sys_vfork, %rdi
34103 +       PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
34104 +       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
34105 +       PTREGSCALL stub_iopl, sys_iopl, %rsi
34106 +
34107 +ENTRY(ptregscall_common)
34108 +       popq %r11
34109 +       CFI_ADJUST_CFA_OFFSET -8
34110 +       CFI_REGISTER rip, r11
34111 +       SAVE_REST
34112 +       movq %r11, %r15
34113 +       CFI_REGISTER rip, r15
34114 +       FIXUP_TOP_OF_STACK %r11
34115 +       call *%rax
34116 +       RESTORE_TOP_OF_STACK %r11
34117 +       movq %r15, %r11
34118 +       CFI_REGISTER rip, r11
34119 +       RESTORE_REST
34120 +       pushq %r11
34121 +       CFI_ADJUST_CFA_OFFSET 8
34122 +       CFI_REL_OFFSET rip, 0
34123 +       ret
34124 +       CFI_ENDPROC
34125 +END(ptregscall_common)
34126 +       
34127 +ENTRY(stub_execve)
34128 +       CFI_STARTPROC
34129 +       popq %r11
34130 +       CFI_ADJUST_CFA_OFFSET -8
34131 +       CFI_REGISTER rip, r11
34132 +       SAVE_REST
34133 +       FIXUP_TOP_OF_STACK %r11
34134 +       call sys_execve
34135 +       RESTORE_TOP_OF_STACK %r11
34136 +       movq %rax,RAX(%rsp)
34137 +       RESTORE_REST
34138 +       jmp int_ret_from_sys_call
34139 +       CFI_ENDPROC
34140 +END(stub_execve)
34141 +       
34142 +/*
34143 + * sigreturn is special because it needs to restore all registers on return.
34144 + * This cannot be done with SYSRET, so use the IRET return path instead.
34145 + */                
34146 +ENTRY(stub_rt_sigreturn)
34147 +       CFI_STARTPROC
34148 +       addq $8, %rsp
34149 +       CFI_ADJUST_CFA_OFFSET   -8
34150 +       SAVE_REST
34151 +       movq %rsp,%rdi
34152 +       FIXUP_TOP_OF_STACK %r11
34153 +       call sys_rt_sigreturn
34154 +       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
34155 +       RESTORE_REST
34156 +       jmp int_ret_from_sys_call
34157 +       CFI_ENDPROC
34158 +END(stub_rt_sigreturn)
34159 +
34160 +/*
34161 + * initial frame state for interrupts and exceptions
34162 + */
34163 +       .macro _frame ref
34164 +       CFI_STARTPROC simple
34165 +       CFI_SIGNAL_FRAME
34166 +       CFI_DEF_CFA rsp,SS+8-\ref
34167 +       /*CFI_REL_OFFSET ss,SS-\ref*/
34168 +       CFI_REL_OFFSET rsp,RSP-\ref
34169 +       /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
34170 +       /*CFI_REL_OFFSET cs,CS-\ref*/
34171 +       CFI_REL_OFFSET rip,RIP-\ref
34172 +       .endm
34173 +
34174 +/* initial frame state for interrupts (and exceptions without error code) */
34175 +#define INTR_FRAME _frame RIP
34176 +/* initial frame state for exceptions with error code (and interrupts with
34177 +   vector already pushed) */
34178 +#define XCPT_FRAME _frame ORIG_RAX
34179 +
34180 +/* 
34181 + * Interrupt exit.
34182 + *
34183 + */ 
34184 +
34185 +retint_check:
34186 +       movl threadinfo_flags(%rcx),%edx
34187 +       andl %edi,%edx
34188 +       CFI_REMEMBER_STATE
34189 +       jnz  retint_careful
34190 +retint_restore_args:
34191 +       movl EFLAGS-REST_SKIP(%rsp), %eax
34192 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
34193 +       XEN_GET_VCPU_INFO(%rsi)
34194 +       andb evtchn_upcall_mask(%rsi),%al
34195 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
34196 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
34197 +       XEN_PUT_VCPU_INFO(%rsi)
34198 +       TRACE_IRQS_IRETQ
34199 +       RESTORE_ARGS 0,8,0
34200 +       HYPERVISOR_IRET 0
34201 +       
34202 +       /* edi: workmask, edx: work */
34203 +retint_careful:
34204 +       CFI_RESTORE_STATE
34205 +       bt    $TIF_NEED_RESCHED,%edx
34206 +       jnc   retint_signal
34207 +       TRACE_IRQS_ON
34208 +       XEN_UNBLOCK_EVENTS(%rsi)
34209 +       pushq %rdi
34210 +       CFI_ADJUST_CFA_OFFSET   8
34211 +       call  schedule
34212 +       popq %rdi               
34213 +       CFI_ADJUST_CFA_OFFSET   -8
34214 +       GET_THREAD_INFO(%rcx)
34215 +       XEN_BLOCK_EVENTS(%rsi)
34216 +       TRACE_IRQS_OFF
34217 +       jmp retint_check
34218 +       
34219 +retint_signal:
34220 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
34221 +       jz    retint_restore_args
34222 +       TRACE_IRQS_ON
34223 +       XEN_UNBLOCK_EVENTS(%rsi)
34224 +       SAVE_REST
34225 +       movq $-1,ORIG_RAX(%rsp)                         
34226 +       xorl %esi,%esi          # oldset
34227 +       movq %rsp,%rdi          # &pt_regs
34228 +       call do_notify_resume
34229 +       RESTORE_REST
34230 +       XEN_BLOCK_EVENTS(%rsi)
34231 +       TRACE_IRQS_OFF
34232 +       movl $_TIF_NEED_RESCHED,%edi
34233 +       GET_THREAD_INFO(%rcx)
34234 +       jmp retint_check
34235 +
34236 +#ifdef CONFIG_PREEMPT
34237 +       /* Returning to kernel space. Check if we need preemption */
34238 +       /* rcx:  threadinfo. interrupts off. */
34239 +ENTRY(retint_kernel)
34240 +       cmpl $0,threadinfo_preempt_count(%rcx)
34241 +       jnz  retint_restore_args
34242 +       bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
34243 +       jnc  retint_restore_args
34244 +       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
34245 +       jnc  retint_restore_args
34246 +       call preempt_schedule_irq
34247 +       jmp retint_kernel       /* check again */
34248 +#endif 
34249 +
34250 +       CFI_ENDPROC
34251 +END(common_interrupt)
34252 +       
34253 +/*
34254 + * APIC interrupts.
34255 + */            
34256 +       .macro apicinterrupt num,func
34257 +       INTR_FRAME
34258 +       pushq $~(\num)
34259 +       CFI_ADJUST_CFA_OFFSET 8
34260 +       interrupt \func
34261 +       jmp error_entry
34262 +       CFI_ENDPROC
34263 +       .endm
34264 +
34265 +#ifndef CONFIG_XEN
34266 +ENTRY(thermal_interrupt)
34267 +       apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
34268 +END(thermal_interrupt)
34269 +
34270 +ENTRY(threshold_interrupt)
34271 +       apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
34272 +END(threshold_interrupt)
34273 +
34274 +#ifdef CONFIG_SMP      
34275 +ENTRY(reschedule_interrupt)
34276 +       apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
34277 +END(reschedule_interrupt)
34278 +
34279 +       .macro INVALIDATE_ENTRY num
34280 +ENTRY(invalidate_interrupt\num)
34281 +       apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
34282 +END(invalidate_interrupt\num)
34283 +       .endm
34284 +
34285 +       INVALIDATE_ENTRY 0
34286 +       INVALIDATE_ENTRY 1
34287 +       INVALIDATE_ENTRY 2
34288 +       INVALIDATE_ENTRY 3
34289 +       INVALIDATE_ENTRY 4
34290 +       INVALIDATE_ENTRY 5
34291 +       INVALIDATE_ENTRY 6
34292 +       INVALIDATE_ENTRY 7
34293 +
34294 +ENTRY(call_function_interrupt)
34295 +       apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
34296 +END(call_function_interrupt)
34297 +#endif
34298 +
34299 +ENTRY(apic_timer_interrupt)
34300 +       apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
34301 +END(apic_timer_interrupt)
34302 +
34303 +ENTRY(error_interrupt)
34304 +       apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
34305 +END(error_interrupt)
34306 +
34307 +ENTRY(spurious_interrupt)
34308 +       apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
34309 +END(spurious_interrupt)
34310 +#endif /* !CONFIG_XEN */
34311 +                               
34312 +/*
34313 + * Exception entry points.
34314 + */            
34315 +       .macro zeroentry sym
34316 +       INTR_FRAME
34317 +       movq (%rsp),%rcx
34318 +       movq 8(%rsp),%r11
34319 +       addq $0x10,%rsp /* skip rcx and r11 */
34320 +       pushq $0        /* push error code/oldrax */ 
34321 +       CFI_ADJUST_CFA_OFFSET 8
34322 +       pushq %rax      /* push real oldrax to the rdi slot */ 
34323 +       CFI_ADJUST_CFA_OFFSET 8
34324 +       leaq  \sym(%rip),%rax
34325 +       jmp error_entry
34326 +       CFI_ENDPROC
34327 +       .endm   
34328 +
34329 +       .macro errorentry sym
34330 +       XCPT_FRAME
34331 +       movq (%rsp),%rcx
34332 +       movq 8(%rsp),%r11
34333 +       addq $0x10,%rsp /* rsp points to the error code */
34334 +       pushq %rax
34335 +       CFI_ADJUST_CFA_OFFSET 8
34336 +       leaq  \sym(%rip),%rax
34337 +       jmp error_entry
34338 +       CFI_ENDPROC
34339 +       .endm
34340 +
34341 +#if 0 /* not XEN */
34342 +       /* error code is on the stack already */
34343 +       /* handle NMI like exceptions that can happen everywhere */
34344 +       .macro paranoidentry sym, ist=0
34345 +       movq (%rsp),%rcx
34346 +       movq 8(%rsp),%r11
34347 +       addq $0x10,%rsp /* skip rcx and r11 */        
34348 +       SAVE_ALL
34349 +       cld
34350 +#if 0 /* not XEN */
34351 +       movl $1,%ebx
34352 +       movl  $MSR_GS_BASE,%ecx
34353 +       rdmsr
34354 +       testl %edx,%edx
34355 +       js    1f
34356 +       swapgs
34357 +       xorl  %ebx,%ebx
34358 +1:
34359 +#endif
34360 +       .if \ist
34361 +       movq    %gs:pda_data_offset, %rbp
34362 +       .endif
34363 +       movq %rsp,%rdi
34364 +       movq ORIG_RAX(%rsp),%rsi
34365 +       movq $-1,ORIG_RAX(%rsp)
34366 +       .if \ist
34367 +       subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
34368 +       .endif
34369 +       call \sym
34370 +       .if \ist
34371 +       addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
34372 +       .endif
34373 +       XEN_BLOCK_EVENTS(%rsi)          
34374 +       .if \irqtrace
34375 +       TRACE_IRQS_OFF
34376 +       .endif
34377 +       .endm
34378 +#endif
34379 +
34380 +/*
34381 + * Exception entry point. This expects an error code/orig_rax on the stack
34382 + * and the exception handler in %rax.  
34383 + */                                            
34384 +KPROBE_ENTRY(error_entry)
34385 +       _frame RDI
34386 +       /* rdi slot contains rax, oldrax contains error code */
34387 +       cld     
34388 +       subq  $14*8,%rsp
34389 +       CFI_ADJUST_CFA_OFFSET   (14*8)
34390 +       movq %rsi,13*8(%rsp)
34391 +       CFI_REL_OFFSET  rsi,RSI
34392 +       movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
34393 +       movq %rdx,12*8(%rsp)
34394 +       CFI_REL_OFFSET  rdx,RDX
34395 +       movq %rcx,11*8(%rsp)
34396 +       CFI_REL_OFFSET  rcx,RCX
34397 +       movq %rsi,10*8(%rsp)    /* store rax */ 
34398 +       CFI_REL_OFFSET  rax,RAX
34399 +       movq %r8, 9*8(%rsp)
34400 +       CFI_REL_OFFSET  r8,R8
34401 +       movq %r9, 8*8(%rsp)
34402 +       CFI_REL_OFFSET  r9,R9
34403 +       movq %r10,7*8(%rsp)
34404 +       CFI_REL_OFFSET  r10,R10
34405 +       movq %r11,6*8(%rsp)
34406 +       CFI_REL_OFFSET  r11,R11
34407 +       movq %rbx,5*8(%rsp) 
34408 +       CFI_REL_OFFSET  rbx,RBX
34409 +       movq %rbp,4*8(%rsp) 
34410 +       CFI_REL_OFFSET  rbp,RBP
34411 +       movq %r12,3*8(%rsp) 
34412 +       CFI_REL_OFFSET  r12,R12
34413 +       movq %r13,2*8(%rsp) 
34414 +       CFI_REL_OFFSET  r13,R13
34415 +       movq %r14,1*8(%rsp) 
34416 +       CFI_REL_OFFSET  r14,R14
34417 +       movq %r15,(%rsp) 
34418 +       CFI_REL_OFFSET  r15,R15
34419 +#if 0
34420 +       cmpl $__KERNEL_CS,CS(%rsp)
34421 +       je  error_kernelspace
34422 +#endif
34423 +error_call_handler:
34424 +       movq %rdi,RDI(%rsp)     
34425 +       movq %rsp,%rdi
34426 +       movq ORIG_RAX(%rsp),%rsi        /* get error code */ 
34427 +       movq $-1,ORIG_RAX(%rsp)
34428 +       call *%rax
34429 +error_exit:            
34430 +       RESTORE_REST
34431 +       XEN_BLOCK_EVENTS(%rsi)
34432 +       TRACE_IRQS_OFF
34433 +       GET_THREAD_INFO(%rcx)   
34434 +       testb $3,CS-ARGOFFSET(%rsp)
34435 +       jz retint_kernel
34436 +       movl  threadinfo_flags(%rcx),%edx
34437 +       movl  $_TIF_WORK_MASK,%edi
34438 +       andl  %edi,%edx
34439 +       jnz   retint_careful
34440 +       jmp   retint_restore_args
34441 +
34442 +error_kernelspace:
34443 +       /*
34444 +        * We need to re-write the logic here because we don't do iretq to
34445 +        * return to user mode. It's still possible that we get a trap/fault
34446 +        * in the kernel (when accessing buffers pointed to by system calls,
34447 +        * for example).
34448 +        *
34449 +        */
34450 +#if 0
34451 +       incl %ebx
34452 +       /* There are two places in the kernel that can potentially fault with
34453 +          usergs. Handle them here. The exception handlers after
34454 +          iret run with kernel gs again, so don't set the user space flag.
34455 +          B stepping K8s sometimes report a truncated RIP for IRET 
34456 +          exceptions returning to compat mode. Check for these here too. */
34457 +       leaq iret_label(%rip),%rbp
34458 +       cmpq %rbp,RIP(%rsp) 
34459 +       je   error_swapgs
34460 +       movl %ebp,%ebp  /* zero extend */
34461 +       cmpq %rbp,RIP(%rsp) 
34462 +       je   error_swapgs
34463 +       cmpq $gs_change,RIP(%rsp)
34464 +        je   error_swapgs
34465 +       jmp  error_sti
34466 +#endif
34467 +KPROBE_END(error_entry)
34468 +
34469 +ENTRY(hypervisor_callback)
34470 +       zeroentry do_hypervisor_callback
34471 +        
34472 +/*
34473 + * Copied from arch/xen/i386/kernel/entry.S
34474 + */               
34475 +# A note on the "critical region" in our callback handler.
34476 +# We want to avoid stacking callback handlers due to events occurring
34477 +# during handling of the last event. To do this, we keep events disabled
34478 +# until we've done all processing. HOWEVER, we must enable events before
34479 +# popping the stack frame (can't be done atomically) and so it would still
34480 +# be possible to get enough handler activations to overflow the stack.
34481 +# Although unlikely, bugs of that kind are hard to track down, so we'd
34482 +# like to avoid the possibility.
34483 +# So, on entry to the handler we detect whether we interrupted an
34484 +# existing activation in its critical region -- if so, we pop the current
34485 +# activation and restart the handler using the previous one.
34486 +ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
34487 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
34488 +# see the correct pointer to the pt_regs
34489 +       movq %rdi, %rsp            # we don't return, adjust the stack frame
34490 +11:    movq %gs:pda_irqstackptr,%rax
34491 +       incl %gs:pda_irqcount
34492 +       cmovzq %rax,%rsp
34493 +       pushq %rdi
34494 +       call evtchn_do_upcall
34495 +       popq %rsp
34496 +       decl %gs:pda_irqcount
34497 +       jmp  error_exit
34498 +
34499 +KPROBE_ENTRY(nmi)
34500 +       zeroentry do_nmi_callback
34501 +ENTRY(xen_do_nmi_callback)
34502 +       addq $8, %rsp
34503 +       call do_nmi
34504 +       orl  $NMI_MASK,EFLAGS(%rsp)
34505 +       RESTORE_REST
34506 +       XEN_BLOCK_EVENTS(%rsi)
34507 +       GET_THREAD_INFO(%rcx)
34508 +       jmp  retint_restore_args
34509 +       .previous .text
34510 +
34511 +        ALIGN
34512 +restore_all_enable_events:  
34513 +       XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
34514 +
34515 +scrit: /**** START OF CRITICAL REGION ****/
34516 +       XEN_TEST_PENDING(%rsi)
34517 +       jnz  14f                        # process more events if necessary...
34518 +       XEN_PUT_VCPU_INFO(%rsi)
34519 +        RESTORE_ARGS 0,8,0
34520 +        HYPERVISOR_IRET 0
34521 +
34522 +14:    XEN_LOCKED_BLOCK_EVENTS(%rsi)
34523 +       XEN_PUT_VCPU_INFO(%rsi)
34524 +       SAVE_REST
34525 +       movq %rsp,%rdi                  # set the argument again
34526 +       jmp  11b
34527 +ecrit:  /**** END OF CRITICAL REGION ****/
34528 +# At this point, unlike on x86-32, we don't do the critical-region fixup: it keeps
34529 +# the code simpler, and the stack frame is more complex on x86-64 anyway.
34530 +# When the kernel is interrupted in the critical section, it simply does IRET,
34531 +# and everything is restored at that point, i.e. execution just resumes from the
34532 +# interrupted instruction with the same context.
34533 +
34534 +# Hypervisor uses this for application faults while it executes.
34535 +# We get here for two reasons:
34536 +#  1. Fault while reloading DS, ES, FS or GS
34537 +#  2. Fault while executing IRET
34538 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
34539 +# registers that could be reloaded and zeroed the others.
34540 +# Category 2 we fix up by killing the current process. We cannot use the
34541 +# normal Linux return path in this case because if we use the IRET hypercall
34542 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
34543 +# We distinguish between categories by comparing each saved segment register
34544 +# with its current contents: any discrepancy means we are in category 1.
34545 +ENTRY(failsafe_callback)
34546 +       movw %ds,%cx
34547 +       cmpw %cx,0x10(%rsp)
34548 +       jne 1f
34549 +       movw %es,%cx
34550 +       cmpw %cx,0x18(%rsp)
34551 +       jne 1f
34552 +       movw %fs,%cx
34553 +       cmpw %cx,0x20(%rsp)
34554 +       jne 1f
34555 +       movw %gs,%cx
34556 +       cmpw %cx,0x28(%rsp)
34557 +       jne 1f
34558 +       /* All segments match their saved values => Category 2 (Bad IRET). */
34559 +       movq (%rsp),%rcx
34560 +       movq 8(%rsp),%r11
34561 +       addq $0x30,%rsp
34562 +       movq $-9999,%rdi        /* better code? */
34563 +       jmp do_exit                     
34564 +1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
34565 +       movq (%rsp),%rcx
34566 +       movq 8(%rsp),%r11
34567 +       addq $0x30,%rsp
34568 +       pushq $0
34569 +       SAVE_ALL
34570 +       jmp error_exit
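As an illustrative aside (this sketch is not part of the patch): the category test above compares each segment selector Xen saved on the failsafe frame with the live register; any mismatch means Xen already zeroed a bad selector (category 1, just retry the IRET), while a full match means the IRET itself faulted (category 2, kill the task). The decision in stand-alone C:

    #include <stdint.h>

    /* saved[] = %ds/%es/%fs/%gs values from the failsafe frame,
     * cur[]   = the live selector registers at callback time. */
    static int failsafe_category(const uint16_t saved[4], const uint16_t cur[4])
    {
            int i;

            for (i = 0; i < 4; i++)
                    if (saved[i] != cur[i])
                            return 1;   /* bad segment: retry the IRET */
            return 2;                   /* bad IRET: terminate the process */
    }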
34571 +#if 0        
34572 +        .section __ex_table,"a"
34573 +        .align 8
34574 +        .quad gs_change,bad_gs
34575 +        .previous
34576 +        .section .fixup,"ax"
34577 +       /* running with kernelgs */
34578 +bad_gs: 
34579 +/*     swapgs          */      /* switch back to user gs */
34580 +       xorl %eax,%eax
34581 +        movl %eax,%gs
34582 +        jmp  2b
34583 +        .previous       
34584 +#endif
34585 +       
34586 +/*
34587 + * Create a kernel thread.
34588 + *
34589 + * C extern interface:
34590 + *     extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
34591 + *
34592 + * asm input arguments:
34593 + *     rdi: fn, rsi: arg, rdx: flags
34594 + */
34595 +ENTRY(kernel_thread)
34596 +       CFI_STARTPROC
34597 +       FAKE_STACK_FRAME $child_rip
34598 +       SAVE_ALL
34599 +
34600 +       # rdi: flags, rsi: usp, rdx: will be &pt_regs
34601 +       movq %rdx,%rdi
34602 +       orq  kernel_thread_flags(%rip),%rdi
34603 +       movq $-1, %rsi
34604 +       movq %rsp, %rdx
34605 +
34606 +       xorl %r8d,%r8d
34607 +       xorl %r9d,%r9d
34608 +       
34609 +       # clone now
34610 +       call do_fork
34611 +       movq %rax,RAX(%rsp)
34612 +       xorl %edi,%edi
34613 +
34614 +       /*
34615 +        * It isn't worth checking for a reschedule here,
34616 +        * so internally to the x86_64 port you can rely on kernel_thread()
34617 +        * not to reschedule the child before returning; this avoids the need
34618 +        * for hacks, for example to fork off the per-CPU idle tasks.
34619 +         * [Hopefully no generic code relies on the reschedule -AK]    
34620 +        */
34621 +       RESTORE_ALL
34622 +       UNFAKE_STACK_FRAME
34623 +       ret
34624 +       CFI_ENDPROC
34625 +ENDPROC(kernel_thread)
34626 +       
34627 +child_rip:
34628 +       pushq $0                # fake return address
34629 +       CFI_STARTPROC
34630 +       /*
34631 +        * Here we are in the child and the registers are set as they were
34632 +        * at kernel_thread() invocation in the parent.
34633 +        */
34634 +       movq %rdi, %rax
34635 +       movq %rsi, %rdi
34636 +       call *%rax
34637 +       # exit
34638 +       xorl %edi, %edi
34639 +       call do_exit
34640 +       CFI_ENDPROC
34641 +ENDPROC(child_rip)
34642 +
34643 +/*
34644 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
34645 + *
34646 + * C extern interface:
34647 + *      extern long execve(char *name, char **argv, char **envp)
34648 + *
34649 + * asm input arguments:
34650 + *     rdi: name, rsi: argv, rdx: envp
34651 + *
34652 + * We want to fall back into:
34653 + *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
34654 + *
34655 + * do_sys_execve asm fallback arguments:
34656 + *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
34657 + */
34658 +ENTRY(kernel_execve)
34659 +       CFI_STARTPROC
34660 +       FAKE_STACK_FRAME $0
34661 +       SAVE_ALL        
34662 +       call sys_execve
34663 +       movq %rax, RAX(%rsp)    
34664 +       RESTORE_REST
34665 +       testq %rax,%rax
34666 +       jne 1f
34667 +       jmp int_ret_from_sys_call
34668 +1:     RESTORE_ARGS
34669 +       UNFAKE_STACK_FRAME
34670 +       ret
34671 +       CFI_ENDPROC
34672 +ENDPROC(kernel_execve)
34673 +
34674 +KPROBE_ENTRY(page_fault)
34675 +       errorentry do_page_fault
34676 +KPROBE_END(page_fault)
34677 +
34678 +ENTRY(coprocessor_error)
34679 +       zeroentry do_coprocessor_error
34680 +END(coprocessor_error)
34681 +
34682 +ENTRY(simd_coprocessor_error)
34683 +       zeroentry do_simd_coprocessor_error     
34684 +END(simd_coprocessor_error)
34685 +
34686 +ENTRY(device_not_available)
34687 +       zeroentry math_state_restore
34688 +END(device_not_available)
34689 +
34690 +       /* runs on exception stack */
34691 +KPROBE_ENTRY(debug)
34692 +       INTR_FRAME
34693 +/*     pushq $0
34694 +       CFI_ADJUST_CFA_OFFSET 8 */
34695 +       zeroentry do_debug
34696 +/*     jmp paranoid_exit */
34697 +       CFI_ENDPROC
34698 +KPROBE_END(debug)
34699 +       .previous .text
34700 +
34701 +#if 0
34702 +       /* runs on exception stack */   
34703 +KPROBE_ENTRY(nmi)
34704 +       INTR_FRAME
34705 +       pushq $-1
34706 +       CFI_ADJUST_CFA_OFFSET 8
34707 +       paranoidentry do_nmi, 0, 0
34708 +#ifdef CONFIG_TRACE_IRQFLAGS
34709 +       paranoidexit 0
34710 +#else
34711 +       jmp paranoid_exit1
34712 +       CFI_ENDPROC
34713 +#endif
34714 +KPROBE_END(nmi)
34715 +       .previous .text
34716 +#endif        
34717 +
34718 +KPROBE_ENTRY(int3)
34719 +       INTR_FRAME
34720 +/*     pushq $0
34721 +       CFI_ADJUST_CFA_OFFSET 8 */
34722 +       zeroentry do_int3
34723 +/*     jmp paranoid_exit */
34724 +       CFI_ENDPROC
34725 +KPROBE_END(int3)
34726 +
34727 +ENTRY(overflow)
34728 +       zeroentry do_overflow
34729 +END(overflow)
34730 +
34731 +ENTRY(bounds)
34732 +       zeroentry do_bounds
34733 +END(bounds)
34734 +
34735 +ENTRY(invalid_op)
34736 +       zeroentry do_invalid_op 
34737 +END(invalid_op)
34738 +
34739 +ENTRY(coprocessor_segment_overrun)
34740 +       zeroentry do_coprocessor_segment_overrun
34741 +END(coprocessor_segment_overrun)
34742 +
34743 +ENTRY(reserved)
34744 +       zeroentry do_reserved
34745 +END(reserved)
34746 +
34747 +#if 0
34748 +       /* runs on exception stack */
34749 +ENTRY(double_fault)
34750 +       XCPT_FRAME
34751 +       paranoidentry do_double_fault
34752 +       jmp paranoid_exit1
34753 +       CFI_ENDPROC
34754 +END(double_fault)
34755 +#endif
34756 +
34757 +ENTRY(invalid_TSS)
34758 +       errorentry do_invalid_TSS
34759 +END(invalid_TSS)
34760 +
34761 +ENTRY(segment_not_present)
34762 +       errorentry do_segment_not_present
34763 +END(segment_not_present)
34764 +
34765 +       /* runs on exception stack */
34766 +ENTRY(stack_segment)
34767 +       XCPT_FRAME
34768 +       errorentry do_stack_segment
34769 +       CFI_ENDPROC
34770 +END(stack_segment)
34771 +
34772 +KPROBE_ENTRY(general_protection)
34773 +       errorentry do_general_protection
34774 +KPROBE_END(general_protection)
34775 +
34776 +ENTRY(alignment_check)
34777 +       errorentry do_alignment_check
34778 +END(alignment_check)
34779 +
34780 +ENTRY(divide_error)
34781 +       zeroentry do_divide_error
34782 +END(divide_error)
34783 +
34784 +ENTRY(spurious_interrupt_bug)
34785 +       zeroentry do_spurious_interrupt_bug
34786 +END(spurious_interrupt_bug)
34787 +
34788 +#ifdef CONFIG_X86_MCE
34789 +       /* runs on exception stack */
34790 +ENTRY(machine_check)
34791 +       INTR_FRAME
34792 +       pushq $0
34793 +       CFI_ADJUST_CFA_OFFSET 8 
34794 +       paranoidentry do_machine_check
34795 +       jmp paranoid_exit1
34796 +       CFI_ENDPROC
34797 +END(machine_check)
34798 +#endif
34799 +
34800 +/* Call softirq on interrupt stack. Interrupts are off. */
34801 +ENTRY(call_softirq)
34802 +       CFI_STARTPROC
34803 +       push %rbp
34804 +       CFI_ADJUST_CFA_OFFSET   8
34805 +       CFI_REL_OFFSET rbp,0
34806 +       mov  %rsp,%rbp
34807 +       CFI_DEF_CFA_REGISTER rbp
34808 +       incl %gs:pda_irqcount
34809 +       cmove %gs:pda_irqstackptr,%rsp
34810 +       push  %rbp                      # backlink for old unwinder
34811 +       call __do_softirq
34812 +       leaveq
34813 +       CFI_DEF_CFA_REGISTER    rsp
34814 +       CFI_ADJUST_CFA_OFFSET   -8
34815 +       decl %gs:pda_irqcount
34816 +       ret
34817 +       CFI_ENDPROC
34818 +ENDPROC(call_softirq)
34819 +
34820 +#ifdef CONFIG_STACK_UNWIND
34821 +ENTRY(arch_unwind_init_running)
34822 +       CFI_STARTPROC
34823 +       movq    %r15, R15(%rdi)
34824 +       movq    %r14, R14(%rdi)
34825 +       xchgq   %rsi, %rdx
34826 +       movq    %r13, R13(%rdi)
34827 +       movq    %r12, R12(%rdi)
34828 +       xorl    %eax, %eax
34829 +       movq    %rbp, RBP(%rdi)
34830 +       movq    %rbx, RBX(%rdi)
34831 +       movq    (%rsp), %rcx
34832 +       movq    %rax, R11(%rdi)
34833 +       movq    %rax, R10(%rdi)
34834 +       movq    %rax, R9(%rdi)
34835 +       movq    %rax, R8(%rdi)
34836 +       movq    %rax, RAX(%rdi)
34837 +       movq    %rax, RCX(%rdi)
34838 +       movq    %rax, RDX(%rdi)
34839 +       movq    %rax, RSI(%rdi)
34840 +       movq    %rax, RDI(%rdi)
34841 +       movq    %rax, ORIG_RAX(%rdi)
34842 +       movq    %rcx, RIP(%rdi)
34843 +       leaq    8(%rsp), %rcx
34844 +       movq    $__KERNEL_CS, CS(%rdi)
34845 +       movq    %rax, EFLAGS(%rdi)
34846 +       movq    %rcx, RSP(%rdi)
34847 +       movq    $__KERNEL_DS, SS(%rdi)
34848 +       jmpq    *%rdx
34849 +       CFI_ENDPROC
34850 +ENDPROC(arch_unwind_init_running)
34851 +#endif
34852 diff -ruNp linux-2.6.19/arch/x86_64/kernel/genapic-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/genapic-xen.c
34853 --- linux-2.6.19/arch/x86_64/kernel/genapic-xen.c       1970-01-01 00:00:00.000000000 +0000
34854 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/genapic-xen.c     2007-02-02 19:10:26.000000000 +0000
34855 @@ -0,0 +1,143 @@
34856 +/*
34857 + * Copyright 2004 James Cleverdon, IBM.
34858 + * Subject to the GNU Public License, v.2
34859 + *
34860 + * Generic APIC sub-arch probe layer.
34861 + *
34862 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
34863 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
34864 + * James Cleverdon.
34865 + */
34866 +#include <linux/threads.h>
34867 +#include <linux/cpumask.h>
34868 +#include <linux/string.h>
34869 +#include <linux/kernel.h>
34870 +#include <linux/ctype.h>
34871 +#include <linux/init.h>
34872 +#include <linux/module.h>
34873 +
34874 +#include <asm/smp.h>
34875 +#include <asm/ipi.h>
34876 +
34877 +#if defined(CONFIG_ACPI)
34878 +#include <acpi/acpi_bus.h>
34879 +#endif
34880 +
34881 +/* which logical CPU number maps to which CPU (physical APIC ID) */
34882 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
34883 +EXPORT_SYMBOL(x86_cpu_to_apicid);
34884 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
34885 +
34886 +extern struct genapic apic_cluster;
34887 +extern struct genapic apic_flat;
34888 +extern struct genapic apic_physflat;
34889 +
34890 +#ifndef CONFIG_XEN
34891 +struct genapic *genapic = &apic_flat;
34892 +#else
34893 +extern struct genapic apic_xen;
34894 +struct genapic *genapic = &apic_xen;
34895 +#endif
34896 +
34897 +
34898 +/*
34899 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
34900 + */
34901 +void __init clustered_apic_check(void)
34902 +{
34903 +#ifndef CONFIG_XEN
34904 +       long i;
34905 +       u8 clusters, max_cluster;
34906 +       u8 id;
34907 +       u8 cluster_cnt[NUM_APIC_CLUSTERS];
34908 +       int max_apic = 0;
34909 +
34910 +#if defined(CONFIG_ACPI)
34911 +       /*
34912 +        * Some x86_64 machines use physical APIC mode regardless of how many
34913 +        * procs/clusters are present (x86_64 ES7000 is an example).
34914 +        */
34915 +       if (acpi_fadt.revision > FADT2_REVISION_ID)
34916 +               if (acpi_fadt.force_apic_physical_destination_mode) {
34917 +                       genapic = &apic_cluster;
34918 +                       goto print;
34919 +               }
34920 +#endif
34921 +
34922 +       memset(cluster_cnt, 0, sizeof(cluster_cnt));
34923 +       for (i = 0; i < NR_CPUS; i++) {
34924 +               id = bios_cpu_apicid[i];
34925 +               if (id == BAD_APICID)
34926 +                       continue;
34927 +               if (id > max_apic)
34928 +                       max_apic = id;
34929 +               cluster_cnt[APIC_CLUSTERID(id)]++;
34930 +       }
34931 +
34932 +       /* Don't use clustered mode on AMD platforms. */
34933 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
34934 +               genapic = &apic_physflat;
34935 +#ifndef CONFIG_HOTPLUG_CPU
34936 +               /* In the CPU hotplug case we cannot use broadcast mode
34937 +                  because that opens a race when a CPU is removed.
34938 +                  Stay at physflat mode in this case.
34939 +                  It is bad to do this unconditionally though. Once
34940 +                  we have ACPI platform support for CPU hotplug
34941 +                  we should detect hotplug capability from ACPI tables and
34942 +                  only do this when really needed. -AK */
34943 +               if (max_apic <= 8)
34944 +                       genapic = &apic_flat;
34945 +#endif
34946 +               goto print;
34947 +       }
34948 +
34949 +       clusters = 0;
34950 +       max_cluster = 0;
34951 +
34952 +       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
34953 +               if (cluster_cnt[i] > 0) {
34954 +                       ++clusters;
34955 +                       if (cluster_cnt[i] > max_cluster)
34956 +                               max_cluster = cluster_cnt[i];
34957 +               }
34958 +       }
34959 +
34960 +       /*
34961 +        * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
34962 +        * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
34963 +        * else physical mode.
34964 +        * (We don't use lowest priority delivery + HW APIC IRQ steering, so
34965 +        * can ignore the clustered logical case and go straight to physical.)
34966 +        */
34967 +       if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
34968 +#ifdef CONFIG_HOTPLUG_CPU
34969 +               /* Don't use APIC shortcuts in CPU hotplug to avoid races */
34970 +               genapic = &apic_physflat;
34971 +#else
34972 +               genapic = &apic_flat;
34973 +#endif
34974 +       } else
34975 +               genapic = &apic_cluster;
34976 +
34977 +print:
34978 +#else
34979 +       /* hardcode to xen apic functions */
34980 +       genapic = &apic_xen;
34981 +#endif
34982 +       printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
34983 +}
34984 +
34985 +/* Same for both flat and clustered. */
34986 +
34987 +#ifdef CONFIG_XEN
34988 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
34989 +#endif
34990 +
34991 +void send_IPI_self(int vector)
34992 +{
34993 +#ifndef CONFIG_XEN
34994 +       __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
34995 +#else
34996 +       xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
34997 +#endif
34998 +}
34999 diff -ruNp linux-2.6.19/arch/x86_64/kernel/genapic_xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/genapic_xen.c
35000 --- linux-2.6.19/arch/x86_64/kernel/genapic_xen.c       1970-01-01 00:00:00.000000000 +0000
35001 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/genapic_xen.c     2007-02-02 19:10:26.000000000 +0000
35002 @@ -0,0 +1,190 @@
35003 +/*
35004 + * Copyright 2004 James Cleverdon, IBM.
35005 + * Subject to the GNU Public License, v.2
35006 + *
35007 + * Xen APIC subarch code.  Maximum 8 CPUs, logical delivery.
35008 + *
35009 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
35010 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
35011 + * James Cleverdon.
35012 + *
35013 + * Hacked to pieces for Xen by Chris Wright.
35014 + */
35015 +#include <linux/threads.h>
35016 +#include <linux/cpumask.h>
35017 +#include <linux/string.h>
35018 +#include <linux/kernel.h>
35019 +#include <linux/ctype.h>
35020 +#include <linux/init.h>
35021 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
35022 +#include <asm/smp.h>
35023 +#include <asm/ipi.h>
35024 +#else
35025 +#include <asm/apic.h>
35026 +#include <asm/apicdef.h>
35027 +#include <asm/genapic.h>
35028 +#endif
35029 +#include <xen/evtchn.h>
35030 +
35031 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
35032 +
35033 +static inline void __send_IPI_one(unsigned int cpu, int vector)
35034 +{
35035 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
35036 +       BUG_ON(irq < 0);
35037 +       notify_remote_via_irq(irq);
35038 +}
35039 +
35040 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
35041 +{
35042 +       int cpu;
35043 +
35044 +       switch (shortcut) {
35045 +       case APIC_DEST_SELF:
35046 +               __send_IPI_one(smp_processor_id(), vector);
35047 +               break;
35048 +       case APIC_DEST_ALLBUT:
35049 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
35050 +                       if (cpu == smp_processor_id())
35051 +                               continue;
35052 +                       if (cpu_isset(cpu, cpu_online_map)) {
35053 +                               __send_IPI_one(cpu, vector);
35054 +                       }
35055 +               }
35056 +               break;
35057 +       case APIC_DEST_ALLINC:
35058 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
35059 +                       if (cpu_isset(cpu, cpu_online_map)) {
35060 +                               __send_IPI_one(cpu, vector);
35061 +                       }
35062 +               }
35063 +               break;
35064 +       default:
35065 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
35066 +                      vector);
35067 +               break;
35068 +       }
35069 +}
35070 +
35071 +static cpumask_t xen_target_cpus(void)
35072 +{
35073 +       return cpu_online_map;
35074 +}
35075 +
35076 +static cpumask_t xen_vector_allocation_domain(int cpu)
35077 +{
35078 +       /* Careful. Some cpus do not strictly honor the set of cpus
35079 +        * specified in the interrupt destination when using lowest
35080 +        * priority interrupt delivery mode.
35081 +        *
35082 +        * In particular there was a hyperthreading cpu observed to
35083 +        * deliver interrupts to the wrong hyperthread when only one
35084 +        * hyperthread was specified in the interrupt destination.
35085 +        */
35086 +       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
35087 +       return domain;
35088 +}
35089 +
35090 +/*
35091 + * Set up the logical destination ID.
35092 + * Do nothing, not called now.
35093 + */
35094 +static void xen_init_apic_ldr(void)
35095 +{
35096 +       Dprintk("%s\n", __FUNCTION__);
35097 +       return;
35098 +}
35099 +
35100 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
35101 +{
35102 +       unsigned long mask = cpus_addr(cpumask)[0];
35103 +       unsigned int cpu;
35104 +       unsigned long flags;
35105 +
35106 +       Dprintk("%s\n", __FUNCTION__);
35107 +       local_irq_save(flags);
35108 +       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
35109 +
35110 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
35111 +               if (cpu_isset(cpu, cpumask)) {
35112 +                       __send_IPI_one(cpu, vector);
35113 +               }
35114 +       }
35115 +       local_irq_restore(flags);
35116 +}
35117 +
35118 +static void xen_send_IPI_allbutself(int vector)
35119 +{
35120 +#ifdef CONFIG_HOTPLUG_CPU
35121 +       int hotplug = 1;
35122 +#else
35123 +       int hotplug = 0;
35124 +#endif
35125 +       /*
35126 +        * If there are no other CPUs in the system then
35127 +        * we get an APIC send error if we try to broadcast,
35128 +        * thus we have to avoid sending IPIs in this case.
35129 +        */
35130 +       Dprintk("%s\n", __FUNCTION__);
35131 +       if (hotplug || vector == NMI_VECTOR) {
35132 +               cpumask_t allbutme = cpu_online_map;
35133 +
35134 +               cpu_clear(smp_processor_id(), allbutme);
35135 +
35136 +               if (!cpus_empty(allbutme))
35137 +                       xen_send_IPI_mask(allbutme, vector);
35138 +       } else if (num_online_cpus() > 1) {
35139 +               xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
35140 +       }
35141 +}
35142 +
35143 +static void xen_send_IPI_all(int vector)
35144 +{
35145 +       Dprintk("%s\n", __FUNCTION__);
35146 +       if (vector == NMI_VECTOR)
35147 +               xen_send_IPI_mask(cpu_online_map, vector);
35148 +       else
35149 +               xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
35150 +}
35151 +
35152 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
35153 +static int xen_apic_id_registered(void)
35154 +{
35155 +       /* better be set */
35156 +       Dprintk("%s\n", __FUNCTION__);
35157 +       return physid_isset(smp_processor_id(), phys_cpu_present_map);
35158 +}
35159 +#endif
35160 +
35161 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
35162 +{
35163 +       Dprintk("%s\n", __FUNCTION__);
35164 +       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
35165 +}
35166 +
35167 +static unsigned int phys_pkg_id(int index_msb)
35168 +{
35169 +       int ebx;
35170 +       Dprintk("%s\n", __FUNCTION__);
35171 +       ebx = cpuid_ebx(1);
35172 +       return ((ebx >> 24) & 0xFF) >> index_msb;
35173 +}
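phys_pkg_id() above follows the standard CPUID.01H layout: EBX bits 31:24 hold the initial APIC ID, and shifting right by index_msb drops the SMT/core bits to leave the package ID. A stand-alone sketch of the same arithmetic (the sample EBX value is made up):

    #include <stdio.h>

    static unsigned int pkg_id_from_ebx(unsigned int ebx, int index_msb)
    {
            /* bits 31:24 = initial APIC ID, then strip index_msb low bits */
            return ((ebx >> 24) & 0xFF) >> index_msb;
    }

    int main(void)
    {
            /* APIC ID 5, two logical CPUs per package (index_msb = 1) -> package 2 */
            printf("%u\n", pkg_id_from_ebx(5u << 24, 1));
            return 0;
    }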
35174 +
35175 +struct genapic apic_xen =  {
35176 +       .name = "xen",
35177 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
35178 +       .int_delivery_mode = dest_LowestPrio,
35179 +#endif
35180 +       .int_dest_mode = (APIC_DEST_LOGICAL != 0),
35181 +       .target_cpus = xen_target_cpus,
35182 +       .vector_allocation_domain = xen_vector_allocation_domain,
35183 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
35184 +       .apic_id_registered = xen_apic_id_registered,
35185 +#endif
35186 +       .init_apic_ldr = xen_init_apic_ldr,
35187 +       .send_IPI_all = xen_send_IPI_all,
35188 +       .send_IPI_allbutself = xen_send_IPI_allbutself,
35189 +       .send_IPI_mask = xen_send_IPI_mask,
35190 +       .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
35191 +       .phys_pkg_id = phys_pkg_id,
35192 +};
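The apic_xen ops above are not called directly; callers go through the global genapic pointer (set to &apic_xen in genapic-xen.c earlier in this patch), so the Xen event-channel IPI routines transparently replace the native APIC paths. A minimal sketch of that dispatch pattern, with made-up names (the real struct genapic lives in asm/genapic.h):

    struct ipi_ops {
            void (*send_ipi_all)(int vector);
    };

    static void xen_send_ipi_all(int vector)
    {
            (void)vector;   /* would notify every online CPU's event channel */
    }

    static struct ipi_ops xen_ops = { .send_ipi_all = xen_send_ipi_all };
    static struct ipi_ops *cur_ops = &xen_ops;  /* analogous to 'genapic = &apic_xen' */

    static void broadcast_ipi(int vector)
    {
            cur_ops->send_ipi_all(vector);      /* caller never names the backend */
    }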
35193 diff -ruNp linux-2.6.19/arch/x86_64/kernel/head-xen.S linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/head-xen.S
35194 --- linux-2.6.19/arch/x86_64/kernel/head-xen.S  1970-01-01 00:00:00.000000000 +0000
35195 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/head-xen.S        2007-02-02 19:10:26.000000000 +0000
35196 @@ -0,0 +1,189 @@
35197 +/*
35198 + *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
35199 + *
35200 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
35201 + *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
35202 + *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
35203 + *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
35204 + *
35205 + *  Jun Nakajima <jun.nakajima@intel.com>
35206 + *    Modified for Xen                                
35207 + */
35208 +
35209 +
35210 +#include <linux/linkage.h>
35211 +#include <linux/threads.h>
35212 +#include <linux/init.h>
35213 +#include <linux/elfnote.h>
35214 +#include <asm/desc.h>
35215 +#include <asm/segment.h>
35216 +#include <asm/page.h>
35217 +#include <asm/msr.h>
35218 +#include <asm/cache.h>
35219 +
35220 +#include <xen/interface/elfnote.h>
35221 +
35222 +       .text
35223 +       .section .bootstrap.text
35224 +       .code64
35225 +#define VIRT_ENTRY_OFFSET 0x0
35226 +.org VIRT_ENTRY_OFFSET
35227 +       .globl startup_64
35228 +startup_64:
35229 +ENTRY(_start)
35230 +       movq $(init_thread_union+THREAD_SIZE-8),%rsp
35231 +       /* zero EFLAGS after setting rsp */
35232 +       pushq $0
35233 +       popfq
35234 +
35235 +       /* rsi is pointer to startup info structure.
35236 +          pass it to C */
35237 +       movq %rsi,%rdi
35238 +       jmp x86_64_start_kernel
35239 +
35240 +ENTRY(stext)
35241 +ENTRY(_stext)
35242 +
35243 +       $page = 0
35244 +#define NEXT_PAGE(name) \
35245 +       $page = $page + 1; \
35246 +       .org $page * 0x1000; \
35247 +       phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
35248 +ENTRY(name)
35249 +
35250 +NEXT_PAGE(init_level4_pgt)
35251 +       /* This gets initialized in x86_64_start_kernel */
35252 +       .fill   512,8,0
35253 +
35254 +        /*
35255 +         * We update two pgd entries to make kernel and user pgd consistent
35256 +         * at pgd_populate(). It can be used for kernel modules. So we place 
35257 +         * this page here for those cases to avoid memory corruption.
35258 +         * We also use this page to establish the initial mapping for
35259 +         * the vsyscall area.
35260 +         */
35261 +NEXT_PAGE(init_level4_user_pgt)
35262 +       .fill   512,8,0
35263 +
35264 +NEXT_PAGE(level3_kernel_pgt)
35265 +       .fill   512,8,0
35266 +
35267 +        /*
35268 +         * This is used for vsyscall area mapping as we have a different
35269 +         * level4 page table for user.
35270 +         */
35271 +NEXT_PAGE(level3_user_pgt)
35272 +        .fill  512,8,0
35273 +
35274 +NEXT_PAGE(level2_kernel_pgt)
35275 +       .fill   512,8,0
35276 +
35277 +NEXT_PAGE(hypercall_page)
35278 +       .fill   512,8,0
35279 +
35280 +#undef NEXT_PAGE
35281 +
35282 +       .data
35283 +
35284 +       .align 16
35285 +       .globl cpu_gdt_descr
35286 +cpu_gdt_descr:
35287 +       .word   gdt_end-cpu_gdt_table-1
35288 +gdt:
35289 +       .quad   cpu_gdt_table
35290 +#ifdef CONFIG_SMP
35291 +       .rept   NR_CPUS-1
35292 +       .word   0
35293 +       .quad   0
35294 +       .endr
35295 +#endif
35296 +
35297 +/* We need valid kernel segments for data and code in long mode too;
35298 + * IRET will check the segment types.  kkeil 2000/10/28
35299 + * Also sysret mandates a special GDT layout.
35300 + */
35301 +                               
35302 +       .section .data.page_aligned, "aw"
35303 +       .align PAGE_SIZE
35304 +
35305 +/* The TLS descriptors are currently at a different place compared to i386.
35306 +   Hopefully nobody expects them at a fixed place (Wine?) */
35307 +       
35308 +ENTRY(cpu_gdt_table)
35309 +       .quad   0x0000000000000000      /* NULL descriptor */
35310 +       .quad   0x0                     /* unused */
35311 +       .quad   0x00af9a000000ffff      /* __KERNEL_CS */
35312 +       .quad   0x00cf92000000ffff      /* __KERNEL_DS */
35313 +       .quad   0x00cffa000000ffff      /* __USER32_CS */
35314 +       .quad   0x00cff2000000ffff      /* __USER_DS, __USER32_DS  */           
35315 +       .quad   0x00affa000000ffff      /* __USER_CS */
35316 +       .quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
35317 +       .quad   0,0                     /* TSS */
35318 +       .quad   0,0                     /* LDT */
35319 +       .quad   0,0,0                   /* three TLS descriptors */ 
35320 +       .quad   0x0000f40000000000      /* node/CPU stored in limit */
35321 +gdt_end:       
35322 +       /* asm/segment.h:GDT_ENTRIES must match this */ 
35323 +       /* This should be a multiple of the cache line size */
35324 +       /* GDTs of other CPUs are now dynamically allocated */
35325 +
35326 +       /* zero the remaining page */
35327 +       .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
35328 +
35329 +       .section .bss, "aw", @nobits
35330 +       .align L1_CACHE_BYTES
35331 +ENTRY(idt_table)
35332 +       .skip 256 * 16
35333 +
35334 +       .section .bss.page_aligned, "aw", @nobits
35335 +       .align PAGE_SIZE
35336 +ENTRY(empty_zero_page)
35337 +       .skip PAGE_SIZE
35338 +
35339 +#ifdef CONFIG_XEN_COMPAT_030002
35340 +/*
35341 + * __xen_guest information
35342 + */
35343 +.macro utoh value
35344 + .if (\value) < 0 || (\value) >= 0x10
35345 +       utoh (((\value)>>4)&0x0fffffffffffffff)
35346 + .endif
35347 + .if ((\value) & 0xf) < 10
35348 +  .byte '0' + ((\value) & 0xf)
35349 + .else
35350 +  .byte 'A' + ((\value) & 0xf) - 10
35351 + .endif
35352 +.endm
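The utoh macro above emits, at assembly time, the uppercase hexadecimal ASCII form of an absolute value by recursing over four-bit nibbles, most significant first; the __xen_guest strings below use it to embed addresses. The same conversion as stand-alone C (illustration only, not part of the patch):

    #include <stdio.h>

    static void utoh(unsigned long value)
    {
            if (value >= 0x10)
                    utoh(value >> 4);               /* high nibbles first */
            putchar("0123456789ABCDEF"[value & 0xf]);
    }

    int main(void)
    {
            utoh(0xffffffff80000000UL);             /* __START_KERNEL_map on x86-64 */
            putchar('\n');                          /* prints FFFFFFFF80000000 */
            return 0;
    }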
35353 +
35354 +.section __xen_guest
35355 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
35356 +       .ascii  ",XEN_VER=xen-3.0"
35357 +       .ascii  ",VIRT_BASE=0x"
35358 +               utoh __START_KERNEL_map
35359 +       .ascii  ",ELF_PADDR_OFFSET=0x"
35360 +               utoh __START_KERNEL_map
35361 +       .ascii  ",VIRT_ENTRY=0x"
35362 +               utoh (__START_KERNEL_map + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
35363 +       .ascii  ",HYPERCALL_PAGE=0x"
35364 +               utoh (phys_hypercall_page >> PAGE_SHIFT)
35365 +       .ascii  ",FEATURES=writable_page_tables"
35366 +       .ascii           "|writable_descriptor_tables"
35367 +       .ascii           "|auto_translated_physmap"
35368 +       .ascii           "|supervisor_mode_kernel"
35369 +       .ascii  ",LOADER=generic"
35370 +       .byte   0
35371 +#endif /* CONFIG_XEN_COMPAT_030002 */
35372 +       
35373 +       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "linux")
35374 +       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "2.6")
35375 +       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
35376 +       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .quad,  __START_KERNEL_map)
35377 +#ifdef CONFIG_XEN_COMPAT_030002
35378 +       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad,  __START_KERNEL_map)
35379 +#else
35380 +       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad,  0)
35381 +#endif /* !CONFIG_XEN_COMPAT_030002 */
35382 +       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad,  startup_64)
35383 +       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad,  hypercall_page)
35384 +       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
35385 +       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz, "generic")
35386 diff -ruNp linux-2.6.19/arch/x86_64/kernel/head64-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/head64-xen.c
35387 --- linux-2.6.19/arch/x86_64/kernel/head64-xen.c        1970-01-01 00:00:00.000000000 +0000
35388 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/head64-xen.c      2007-02-02 19:10:26.000000000 +0000
35389 @@ -0,0 +1,119 @@
35390 +/*
35391 + *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
35392 + *
35393 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
35394 + *
35395 + *  Jun Nakajima <jun.nakajima@intel.com>
35396 + *     Modified for Xen.
35397 + */
35398 +
35399 +#include <linux/init.h>
35400 +#include <linux/linkage.h>
35401 +#include <linux/types.h>
35402 +#include <linux/kernel.h>
35403 +#include <linux/string.h>
35404 +#include <linux/percpu.h>
35405 +#include <linux/module.h>
35406 +
35407 +#include <asm/processor.h>
35408 +#include <asm/proto.h>
35409 +#include <asm/smp.h>
35410 +#include <asm/bootsetup.h>
35411 +#include <asm/setup.h>
35412 +#include <asm/desc.h>
35413 +#include <asm/pgtable.h>
35414 +#include <asm/sections.h>
35415 +
35416 +unsigned long start_pfn;
35417 +
35418 +/* Don't add a printk in there. printk relies on the PDA which is not initialized 
35419 +   yet. */
35420 +#if 0
35421 +static void __init clear_bss(void)
35422 +{
35423 +       memset(__bss_start, 0,
35424 +              (unsigned long) __bss_stop - (unsigned long) __bss_start);
35425 +}
35426 +#endif
35427 +
35428 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
35429 +#define OLD_CL_MAGIC_ADDR      0x90020
35430 +#define OLD_CL_MAGIC            0xA33F
35431 +#define OLD_CL_BASE_ADDR        0x90000
35432 +#define OLD_CL_OFFSET           0x90022
35433 +
35434 +extern char saved_command_line[];
35435 +
35436 +static void __init copy_bootdata(char *real_mode_data)
35437 +{
35438 +#ifndef CONFIG_XEN
35439 +       int new_data;
35440 +       char * command_line;
35441 +
35442 +       memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
35443 +       new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
35444 +       if (!new_data) {
35445 +               if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
35446 +                       return;
35447 +               }
35448 +               new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
35449 +       }
35450 +       command_line = (char *) ((u64)(new_data));
35451 +       memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
35452 +#else
35453 +       int max_cmdline;
35454 +       
35455 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
35456 +               max_cmdline = COMMAND_LINE_SIZE;
35457 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
35458 +       saved_command_line[max_cmdline-1] = '\0';
35459 +#endif
35460 +}
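In the Xen branch above, the guest-supplied command line is clamped to the smaller of MAX_GUEST_CMDLINE and COMMAND_LINE_SIZE and forcibly NUL-terminated, presumably because the string handed over by the domain builder need not fit or be terminated. The same clamp-and-terminate idiom in isolation (generic names, not the kernel symbols):

    #include <string.h>

    static void copy_cmdline(char *dst, size_t dst_size,
                             const char *src, size_t src_max)
    {
            size_t n = (src_max > dst_size) ? dst_size : src_max;

            memcpy(dst, src, n);
            dst[n - 1] = '\0';      /* never trust the source to be terminated */
    }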
35461 +
35462 +#include <xen/interface/memory.h>
35463 +unsigned long *machine_to_phys_mapping;
35464 +EXPORT_SYMBOL(machine_to_phys_mapping);
35465 +unsigned int machine_to_phys_order;
35466 +EXPORT_SYMBOL(machine_to_phys_order);
35467 +
35468 +void __init x86_64_start_kernel(char * real_mode_data)
35469 +{
35470 +       struct xen_machphys_mapping mapping;
35471 +       unsigned long machine_to_phys_nr_ents;
35472 +       int i;
35473 +
35474 +       xen_start_info = (struct start_info *)real_mode_data;
35475 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
35476 +               phys_to_machine_mapping =
35477 +                       (unsigned long *)xen_start_info->mfn_list;
35478 +               start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
35479 +                       xen_start_info->nr_pt_frames;
35480 +       }
35481 +
35482 +       machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
35483 +       machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
35484 +       if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
35485 +               machine_to_phys_mapping = (unsigned long *)mapping.v_start;
35486 +               machine_to_phys_nr_ents = mapping.max_mfn + 1;
35487 +       }
35488 +       while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
35489 +               machine_to_phys_order++;
35490 +
35491 +#if 0
35492 +       for (i = 0; i < 256; i++)
35493 +               set_intr_gate(i, early_idt_handler);
35494 +       asm volatile("lidt %0" :: "m" (idt_descr));
35495 +#endif
35496 +
35497 +       early_printk("Kernel alive\n");
35498 +
35499 +       for (i = 0; i < NR_CPUS; i++)
35500 +               cpu_pda(i) = &boot_cpu_pda[i];
35501 +
35502 +       pda_init(0);
35503 +       copy_bootdata(real_mode_data);
35504 +#ifdef CONFIG_SMP
35505 +       cpu_set(0, cpu_online_map);
35506 +#endif
35507 +       start_kernel();
35508 +}
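The while loop above leaves machine_to_phys_order as the smallest exponent with (1UL << order) >= machine_to_phys_nr_ents, i.e. a ceiling log2 of the machine-to-physical table size. The same computation in isolation (illustrative, not kernel code):

    #include <assert.h>

    /* Smallest order such that (1UL << order) covers n entries (n > 0). */
    static unsigned int order_for(unsigned long n)
    {
            unsigned int order = 0;

            while ((1UL << order) < n)
                    order++;
            return order;
    }

    int main(void)
    {
            assert(order_for(1) == 0);
            assert(order_for(1UL << 20) == 20);
            assert(order_for((1UL << 20) + 1) == 21);
            return 0;
    }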
35509 diff -ruNp linux-2.6.19/arch/x86_64/kernel/init_task.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/init_task.c
35510 --- linux-2.6.19/arch/x86_64/kernel/init_task.c 2006-11-29 21:57:37.000000000 +0000
35511 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/init_task.c       2007-02-02 19:10:26.000000000 +0000
35512 @@ -37,6 +37,8 @@ union thread_union init_thread_union 
35513  struct task_struct init_task = INIT_TASK(init_task);
35514  
35515  EXPORT_SYMBOL(init_task);
35516 +
35517 +#ifndef CONFIG_X86_NO_TSS
35518  /*
35519   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
35520   * no more per-task TSS's. The TSS size is kept cacheline-aligned
35521 @@ -45,6 +47,7 @@ EXPORT_SYMBOL(init_task);
35522   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
35523   */ 
35524  DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
35525 +#endif
35526  
35527  /* Copies of the original ist values from the tss are only accessed during
35528   * debugging, no special alignment required.
35529 diff -ruNp linux-2.6.19/arch/x86_64/kernel/io_apic-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/io_apic-xen.c
35530 --- linux-2.6.19/arch/x86_64/kernel/io_apic-xen.c       1970-01-01 00:00:00.000000000 +0000
35531 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/io_apic-xen.c     2007-02-02 19:10:26.000000000 +0000
35532 @@ -0,0 +1,2182 @@
35533 +/*
35534 + *     Intel IO-APIC support for multi-Pentium hosts.
35535 + *
35536 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
35537 + *
35538 + *     Many thanks to Stig Venaas for trying out countless experimental
35539 + *     patches and reporting/debugging problems patiently!
35540 + *
35541 + *     (c) 1999, Multiple IO-APIC support, developed by
35542 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
35543 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
35544 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
35545 + *     and Ingo Molnar <mingo@redhat.com>
35546 + *
35547 + *     Fixes
35548 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
35549 + *                                     thanks to Eric Gilmore
35550 + *                                     and Rolf G. Tews
35551 + *                                     for testing these extensively
35552 + *     Paul Diefenbaugh        :       Added full ACPI support
35553 + */
35554 +
35555 +#include <linux/mm.h>
35556 +#include <linux/interrupt.h>
35557 +#include <linux/init.h>
35558 +#include <linux/delay.h>
35559 +#include <linux/sched.h>
35560 +#include <linux/smp_lock.h>
35561 +#include <linux/pci.h>
35562 +#include <linux/mc146818rtc.h>
35563 +#include <linux/acpi.h>
35564 +#include <linux/sysdev.h>
35565 +#include <linux/msi.h>
35566 +#include <linux/htirq.h>
35567 +#ifdef CONFIG_ACPI
35568 +#include <acpi/acpi_bus.h>
35569 +#endif
35570 +
35571 +#include <asm/io.h>
35572 +#include <asm/smp.h>
35573 +#include <asm/desc.h>
35574 +#include <asm/proto.h>
35575 +#include <asm/mach_apic.h>
35576 +#include <asm/acpi.h>
35577 +#include <asm/dma.h>
35578 +#include <asm/nmi.h>
35579 +#include <asm/msidef.h>
35580 +#include <asm/hypertransport.h>
35581 +
35582 +DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
35583 +       [0 ... FIRST_EXTERNAL_VECTOR - 1] = -1,
35584 +       [FIRST_EXTERNAL_VECTOR + 0] = 0,
35585 +       [FIRST_EXTERNAL_VECTOR + 1] = 1,
35586 +       [FIRST_EXTERNAL_VECTOR + 2] = 2,
35587 +       [FIRST_EXTERNAL_VECTOR + 3] = 3,
35588 +       [FIRST_EXTERNAL_VECTOR + 4] = 4,
35589 +       [FIRST_EXTERNAL_VECTOR + 5] = 5,
35590 +       [FIRST_EXTERNAL_VECTOR + 6] = 6,
35591 +       [FIRST_EXTERNAL_VECTOR + 7] = 7,
35592 +       [FIRST_EXTERNAL_VECTOR + 8] = 8,
35593 +       [FIRST_EXTERNAL_VECTOR + 9] = 9,
35594 +       [FIRST_EXTERNAL_VECTOR + 10] = 10,
35595 +       [FIRST_EXTERNAL_VECTOR + 11] = 11,
35596 +       [FIRST_EXTERNAL_VECTOR + 12] = 12,
35597 +       [FIRST_EXTERNAL_VECTOR + 13] = 13,
35598 +       [FIRST_EXTERNAL_VECTOR + 14] = 14,
35599 +       [FIRST_EXTERNAL_VECTOR + 15] = 15,
35600 +       [FIRST_EXTERNAL_VECTOR + 16 ... NR_VECTORS - 1] = -1
35601 +};
35602 +
35603 +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
35604 +
35605 +#define __apicdebuginit  __init
35606 +
35607 +int sis_apic_bug; /* not actually supported, dummy for compile */
35608 +
35609 +static int no_timer_check;
35610 +
35611 +static int disable_timer_pin_1 __initdata;
35612 +
35613 +int timer_over_8254 __initdata = 1;
35614 +#ifndef CONFIG_XEN
35615 +
35616 +/* Where if anywhere is the i8259 connect in external int mode */
35617 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
35618 +#endif
35619 +
35620 +static DEFINE_SPINLOCK(ioapic_lock);
35621 +DEFINE_SPINLOCK(vector_lock);
35622 +
35623 +/*
35624 + * # of IRQ routing registers
35625 + */
35626 +int nr_ioapic_registers[MAX_IO_APICS];
35627 +
35628 +/*
35629 + * Rough estimation of how many shared IRQs there are, can
35630 + * be changed anytime.
35631 + */
35632 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
35633 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
35634 +
35635 +/*
35636 + * This is performance-critical, we want to do it O(1)
35637 + *
35638 + * the indexing order of this array favors 1:1 mappings
35639 + * between pins and IRQs.
35640 + */
35641 +
35642 +static struct irq_pin_list {
35643 +       short apic, pin, next;
35644 +} irq_2_pin[PIN_MAP_SIZE];
35645 +
35646 +struct io_apic {
35647 +       unsigned int index;
35648 +       unsigned int unused[3];
35649 +       unsigned int data;
35650 +};
35651 +
35652 +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
35653 +{
35654 +       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
35655 +               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
35656 +}
35657 +
35658 +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
35659 +{
35660 +       struct io_apic __iomem *io_apic = io_apic_base(apic);
35661 +       writel(reg, &io_apic->index);
35662 +       return readl(&io_apic->data);
35663 +}
35664 +
35665 +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
35666 +{
35667 +       struct io_apic __iomem *io_apic = io_apic_base(apic);
35668 +       writel(reg, &io_apic->index);
35669 +       writel(value, &io_apic->data);
35670 +}
35671 +
35672 +/*
35673 + * Re-write a value: to be used for read-modify-write
35674 + * cycles where the read already set up the index register.
35675 + */
35676 +static inline void io_apic_modify(unsigned int apic, unsigned int value)
35677 +{
35678 +       struct io_apic __iomem *io_apic = io_apic_base(apic);
35679 +       writel(value, &io_apic->data);
35680 +}
35681 +
35682 +/*
35683 + * Synchronize the IO-APIC and the CPU by doing
35684 + * a dummy read from the IO-APIC
35685 + */
35686 +static inline void io_apic_sync(unsigned int apic)
35687 +{
35688 +       struct io_apic __iomem *io_apic = io_apic_base(apic);
35689 +       readl(&io_apic->data);
35690 +}
35691 +
35692 +#ifdef CONFIG_XEN
35693 +
35694 +#include <xen/interface/xen.h>
35695 +#include <xen/interface/physdev.h>
35696 +
35697 +/* Fake i8259 */
35698 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
35699 +#define disable_8259A_irq(_irq)  ((void)0)
35700 +#define i8259A_irq_pending(_irq) (0)
35701 +
35702 +unsigned long io_apic_irqs;
35703 +
35704 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
35705 +{
35706 +       struct physdev_apic apic_op;
35707 +       int ret;
35708 +
35709 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
35710 +       apic_op.reg = reg;
35711 +       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
35712 +       if (ret)
35713 +               return ret;
35714 +       return apic_op.value;
35715 +}
35716 +
35717 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
35718 +{
35719 +       struct physdev_apic apic_op;
35720 +
35721 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
35722 +       apic_op.reg = reg;
35723 +       apic_op.value = value;
35724 +       HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
35725 +}
35726 +
35727 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
35728 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
35729 +
35730 +#define clear_IO_APIC() ((void)0)
35731 +#endif
35732 +
35733 +#ifndef CONFIG_XEN
35734 +#define __DO_ACTION(R, ACTION, FINAL)                                  \
35735 +                                                                       \
35736 +{                                                                      \
35737 +       int pin;                                                        \
35738 +       struct irq_pin_list *entry = irq_2_pin + irq;                   \
35739 +                                                                       \
35740 +       BUG_ON(irq >= NR_IRQS);                                         \
35741 +       for (;;) {                                                      \
35742 +               unsigned int reg;                                       \
35743 +               pin = entry->pin;                                       \
35744 +               if (pin == -1)                                          \
35745 +                       break;                                          \
35746 +               reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
35747 +               reg ACTION;                                             \
35748 +               io_apic_modify(entry->apic, reg);                       \
35749 +               if (!entry->next)                                       \
35750 +                       break;                                          \
35751 +               entry = irq_2_pin + entry->next;                        \
35752 +       }                                                               \
35753 +       FINAL;                                                          \
35754 +}
35755 +#endif /* !CONFIG_XEN */
35756 +
35757 +union entry_union {
35758 +       struct { u32 w1, w2; };
35759 +       struct IO_APIC_route_entry entry;
35760 +};
35761 +
35762 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
35763 +{
35764 +       union entry_union eu;
35765 +       unsigned long flags;
35766 +       spin_lock_irqsave(&ioapic_lock, flags);
35767 +       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
35768 +       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
35769 +       spin_unlock_irqrestore(&ioapic_lock, flags);
35770 +       return eu.entry;
35771 +}
35772 +
35773 +/*
35774 + * When we write a new IO APIC routing entry, we need to write the high
35775 + * word first! If the mask bit in the low word is clear, we will enable
35776 + * the interrupt, and we need to make sure the entry is fully populated
35777 + * before that happens.
35778 + */
35779 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
35780 +{
35781 +       unsigned long flags;
35782 +       union entry_union eu;
35783 +       eu.entry = e;
35784 +       spin_lock_irqsave(&ioapic_lock, flags);
35785 +       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
35786 +       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
35787 +       spin_unlock_irqrestore(&ioapic_lock, flags);
35788 +}
35789 +
35790 +/*
35791 + * When we mask an IO APIC routing entry, we need to write the low
35792 + * word first, in order to set the mask bit before we change the
35793 + * high bits!
35794 + */
35795 +static void ioapic_mask_entry(int apic, int pin)
35796 +{
35797 +       unsigned long flags;
35798 +       union entry_union eu = { .entry.mask = 1 };
35799 +
35800 +       spin_lock_irqsave(&ioapic_lock, flags);
35801 +       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
35802 +       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
35803 +       spin_unlock_irqrestore(&ioapic_lock, flags);
35804 +}
35805 +
35806 +#ifndef CONFIG_XEN
35807 +#ifdef CONFIG_SMP
35808 +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
35809 +{
35810 +       int apic, pin;
35811 +       struct irq_pin_list *entry = irq_2_pin + irq;
35812 +
35813 +       BUG_ON(irq >= NR_IRQS);
35814 +       for (;;) {
35815 +               unsigned int reg;
35816 +               apic = entry->apic;
35817 +               pin = entry->pin;
35818 +               if (pin == -1)
35819 +                       break;
35820 +               io_apic_write(apic, 0x11 + pin*2, dest);
35821 +               reg = io_apic_read(apic, 0x10 + pin*2);
35822 +               reg &= ~0x000000ff;
35823 +               reg |= vector;
35824 +               io_apic_modify(apic, reg);
35825 +               if (!entry->next)
35826 +                       break;
35827 +               entry = irq_2_pin + entry->next;
35828 +       }
35829 +}
35830 +
35831 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
35832 +{
35833 +       unsigned long flags;
35834 +       unsigned int dest;
35835 +       cpumask_t tmp;
35836 +       int vector;
35837 +
35838 +       cpus_and(tmp, mask, cpu_online_map);
35839 +       if (cpus_empty(tmp))
35840 +               tmp = TARGET_CPUS;
35841 +
35842 +       cpus_and(mask, tmp, CPU_MASK_ALL);
35843 +
35844 +       vector = assign_irq_vector(irq, mask, &tmp);
35845 +       if (vector < 0)
35846 +               return;
35847 +
35848 +       dest = cpu_mask_to_apicid(tmp);
35849 +
35850 +       /*
35851 +        * Only the high 8 bits are valid.
35852 +        */
35853 +       dest = SET_APIC_LOGICAL_ID(dest);
35854 +
35855 +       spin_lock_irqsave(&ioapic_lock, flags);
35856 +       __target_IO_APIC_irq(irq, dest, vector);
35857 +       set_native_irq_info(irq, mask);
35858 +       spin_unlock_irqrestore(&ioapic_lock, flags);
35859 +}
35860 +#endif
35861 +#endif /* !CONFIG_XEN */
35862 +
35863 +/*
35864 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
35865 + * shared ISA-space IRQs, so we have to support them. We are super
35866 + * fast in the common case, and fast for shared ISA-space IRQs.
35867 + */
35868 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
35869 +{
35870 +       static int first_free_entry = NR_IRQS;
35871 +       struct irq_pin_list *entry = irq_2_pin + irq;
35872 +
35873 +       BUG_ON(irq >= NR_IRQS);
35874 +       while (entry->next)
35875 +               entry = irq_2_pin + entry->next;
35876 +
35877 +       if (entry->pin != -1) {
35878 +               entry->next = first_free_entry;
35879 +               entry = irq_2_pin + entry->next;
35880 +               if (++first_free_entry >= PIN_MAP_SIZE)
35881 +                       panic("io_apic.c: ran out of irq_2_pin entries!");
35882 +       }
35883 +       entry->apic = apic;
35884 +       entry->pin = pin;
35885 +}
35886 +
35887 +#ifndef CONFIG_XEN
35888 +
35889 +#define DO_ACTION(name,R,ACTION, FINAL)                                        \
35890 +                                                                       \
35891 +       static void name##_IO_APIC_irq (unsigned int irq)               \
35892 +       __DO_ACTION(R, ACTION, FINAL)
35893 +
35894 +DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
35895 +                                               /* mask = 1 */
35896 +DO_ACTION( __unmask,           0, &= 0xfffeffff, )
35897 +                                               /* mask = 0 */
35898 +
35899 +static void mask_IO_APIC_irq (unsigned int irq)
35900 +{
35901 +       unsigned long flags;
35902 +
35903 +       spin_lock_irqsave(&ioapic_lock, flags);
35904 +       __mask_IO_APIC_irq(irq);
35905 +       spin_unlock_irqrestore(&ioapic_lock, flags);
35906 +}
35907 +
35908 +static void unmask_IO_APIC_irq (unsigned int irq)
35909 +{
35910 +       unsigned long flags;
35911 +
35912 +       spin_lock_irqsave(&ioapic_lock, flags);
35913 +       __unmask_IO_APIC_irq(irq);
35914 +       spin_unlock_irqrestore(&ioapic_lock, flags);
35915 +}
35916 +
35917 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
35918 +{
35919 +       struct IO_APIC_route_entry entry;
35920 +
35921 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
35922 +       entry = ioapic_read_entry(apic, pin);
35923 +       if (entry.delivery_mode == dest_SMI)
35924 +               return;
35925 +       /*
35926 +        * Disable it in the IO-APIC irq-routing table:
35927 +        */
35928 +       ioapic_mask_entry(apic, pin);
35929 +}
35930 +
35931 +static void clear_IO_APIC (void)
35932 +{
35933 +       int apic, pin;
35934 +
35935 +       for (apic = 0; apic < nr_ioapics; apic++)
35936 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
35937 +                       clear_IO_APIC_pin(apic, pin);
35938 +}
35939 +
35940 +#endif /* !CONFIG_XEN */
35941 +int skip_ioapic_setup;
35942 +int ioapic_force;
35943 +
35944 +/* dummy parsing: see setup.c */
35945 +
35946 +static int __init disable_ioapic_setup(char *str)
35947 +{
35948 +       skip_ioapic_setup = 1;
35949 +       return 0;
35950 +}
35951 +early_param("noapic", disable_ioapic_setup);
35952 +
35953 +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
35954 +static int __init disable_timer_pin_setup(char *arg)
35955 +{
35956 +       disable_timer_pin_1 = 1;
35957 +       return 1;
35958 +}
35959 +__setup("disable_timer_pin_1", disable_timer_pin_setup);
35960 +
35961 +#ifndef CONFIG_XEN
35962 +static int __init setup_disable_8254_timer(char *s)
35963 +{
35964 +       timer_over_8254 = -1;
35965 +       return 1;
35966 +}
35967 +static int __init setup_enable_8254_timer(char *s)
35968 +{
35969 +       timer_over_8254 = 2;
35970 +       return 1;
35971 +}
35972 +
35973 +__setup("disable_8254_timer", setup_disable_8254_timer);
35974 +__setup("enable_8254_timer", setup_enable_8254_timer);
35975 +#endif /* !CONFIG_XEN */
35976 +
35977 +/*
35978 + * Find the IRQ entry number of a certain pin.
35979 + */
35980 +static int find_irq_entry(int apic, int pin, int type)
35981 +{
35982 +       int i;
35983 +
35984 +       for (i = 0; i < mp_irq_entries; i++)
35985 +               if (mp_irqs[i].mpc_irqtype == type &&
35986 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
35987 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
35988 +                   mp_irqs[i].mpc_dstirq == pin)
35989 +                       return i;
35990 +
35991 +       return -1;
35992 +}
35993 +
35994 +#ifndef CONFIG_XEN
35995 +/*
35996 + * Find the pin to which IRQ[irq] (ISA) is connected
35997 + */
35998 +static int __init find_isa_irq_pin(int irq, int type)
35999 +{
36000 +       int i;
36001 +
36002 +       for (i = 0; i < mp_irq_entries; i++) {
36003 +               int lbus = mp_irqs[i].mpc_srcbus;
36004 +
36005 +               if (test_bit(lbus, mp_bus_not_pci) &&
36006 +                   (mp_irqs[i].mpc_irqtype == type) &&
36007 +                   (mp_irqs[i].mpc_srcbusirq == irq))
36008 +
36009 +                       return mp_irqs[i].mpc_dstirq;
36010 +       }
36011 +       return -1;
36012 +}
36013 +
36014 +static int __init find_isa_irq_apic(int irq, int type)
36015 +{
36016 +       int i;
36017 +
36018 +       for (i = 0; i < mp_irq_entries; i++) {
36019 +               int lbus = mp_irqs[i].mpc_srcbus;
36020 +
36021 +               if (test_bit(lbus, mp_bus_not_pci) &&
36022 +                   (mp_irqs[i].mpc_irqtype == type) &&
36023 +                   (mp_irqs[i].mpc_srcbusirq == irq))
36024 +                       break;
36025 +       }
36026 +       if (i < mp_irq_entries) {
36027 +               int apic;
36028 +               for(apic = 0; apic < nr_ioapics; apic++) {
36029 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
36030 +                               return apic;
36031 +               }
36032 +       }
36033 +
36034 +       return -1;
36035 +}
36036 +#endif
36037 +
36038 +/*
36039 + * Find a specific PCI IRQ entry.
36040 + * Not an __init, possibly needed by modules
36041 + */
36042 +static int pin_2_irq(int idx, int apic, int pin);
36043 +
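+/*
+ * For PCI buses the MP table encodes the source bus IRQ as
+ * (device << 2) | INTx pin, hence the shifts and masks below.
+ */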
36044 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
36045 +{
36046 +       int apic, i, best_guess = -1;
36047 +
36048 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
36049 +               bus, slot, pin);
36050 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
36051 +               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
36052 +               return -1;
36053 +       }
36054 +       for (i = 0; i < mp_irq_entries; i++) {
36055 +               int lbus = mp_irqs[i].mpc_srcbus;
36056 +
36057 +               for (apic = 0; apic < nr_ioapics; apic++)
36058 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
36059 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
36060 +                               break;
36061 +
36062 +               if (!test_bit(lbus, mp_bus_not_pci) &&
36063 +                   !mp_irqs[i].mpc_irqtype &&
36064 +                   (bus == lbus) &&
36065 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
36066 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
36067 +
36068 +                       if (!(apic || IO_APIC_IRQ(irq)))
36069 +                               continue;
36070 +
36071 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
36072 +                               return irq;
36073 +                       /*
36074 +                        * Use the first all-but-pin matching entry as a
36075 +                        * best-guess fuzzy result for broken mptables.
36076 +                        */
36077 +                       if (best_guess < 0)
36078 +                               best_guess = irq;
36079 +               }
36080 +       }
36081 +       BUG_ON(best_guess >= NR_IRQS);
36082 +       return best_guess;
36083 +}
36084 +
36085 +/* ISA interrupts are always polarity zero edge triggered,
36086 + * when listed as conforming in the MP table. */
36087 +
36088 +#define default_ISA_trigger(idx)       (0)
36089 +#define default_ISA_polarity(idx)      (0)
36090 +
36091 +/* PCI interrupts are always polarity one level triggered,
36092 + * when listed as conforming in the MP table. */
36093 +
36094 +#define default_PCI_trigger(idx)       (1)
36095 +#define default_PCI_polarity(idx)      (1)
36096 +
36097 +static int __init MPBIOS_polarity(int idx)
36098 +{
36099 +       int bus = mp_irqs[idx].mpc_srcbus;
36100 +       int polarity;
36101 +
36102 +       /*
36103 +        * Determine IRQ line polarity (high active or low active):
36104 +        */
36105 +       switch (mp_irqs[idx].mpc_irqflag & 3)
36106 +       {
36107 +               case 0: /* conforms, i.e. bus-type dependent polarity */
36108 +                       if (test_bit(bus, mp_bus_not_pci))
36109 +                               polarity = default_ISA_polarity(idx);
36110 +                       else
36111 +                               polarity = default_PCI_polarity(idx);
36112 +                       break;
36113 +               case 1: /* high active */
36114 +               {
36115 +                       polarity = 0;
36116 +                       break;
36117 +               }
36118 +               case 2: /* reserved */
36119 +               {
36120 +                       printk(KERN_WARNING "broken BIOS!!\n");
36121 +                       polarity = 1;
36122 +                       break;
36123 +               }
36124 +               case 3: /* low active */
36125 +               {
36126 +                       polarity = 1;
36127 +                       break;
36128 +               }
36129 +               default: /* invalid */
36130 +               {
36131 +                       printk(KERN_WARNING "broken BIOS!!\n");
36132 +                       polarity = 1;
36133 +                       break;
36134 +               }
36135 +       }
36136 +       return polarity;
36137 +}
36138 +
36139 +static int MPBIOS_trigger(int idx)
36140 +{
36141 +       int bus = mp_irqs[idx].mpc_srcbus;
36142 +       int trigger;
36143 +
36144 +       /*
36145 +        * Determine IRQ trigger mode (edge or level sensitive):
36146 +        */
36147 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
36148 +       {
36149 +               case 0: /* conforms, i.e. bus-type dependent */
36150 +                       if (test_bit(bus, mp_bus_not_pci))
36151 +                               trigger = default_ISA_trigger(idx);
36152 +                       else
36153 +                               trigger = default_PCI_trigger(idx);
36154 +                       break;
36155 +               case 1: /* edge */
36156 +               {
36157 +                       trigger = 0;
36158 +                       break;
36159 +               }
36160 +               case 2: /* reserved */
36161 +               {
36162 +                       printk(KERN_WARNING "broken BIOS!!\n");
36163 +                       trigger = 1;
36164 +                       break;
36165 +               }
36166 +               case 3: /* level */
36167 +               {
36168 +                       trigger = 1;
36169 +                       break;
36170 +               }
36171 +               default: /* invalid */
36172 +               {
36173 +                       printk(KERN_WARNING "broken BIOS!!\n");
36174 +                       trigger = 0;
36175 +                       break;
36176 +               }
36177 +       }
36178 +       return trigger;
36179 +}
36180 +
36181 +static inline int irq_polarity(int idx)
36182 +{
36183 +       return MPBIOS_polarity(idx);
36184 +}
36185 +
36186 +static inline int irq_trigger(int idx)
36187 +{
36188 +       return MPBIOS_trigger(idx);
36189 +}
36190 +
36191 +static int pin_2_irq(int idx, int apic, int pin)
36192 +{
36193 +       int irq, i;
36194 +       int bus = mp_irqs[idx].mpc_srcbus;
36195 +
36196 +       /*
36197 +        * Debugging check, we are in big trouble if this message pops up!
36198 +        */
36199 +       if (mp_irqs[idx].mpc_dstirq != pin)
36200 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
36201 +
36202 +       if (test_bit(bus, mp_bus_not_pci)) {
36203 +               irq = mp_irqs[idx].mpc_srcbusirq;
36204 +       } else {
36205 +               /*
36206 +                * PCI IRQs are mapped in order
36207 +                */
36208 +               i = irq = 0;
36209 +               while (i < apic)
36210 +                       irq += nr_ioapic_registers[i++];
36211 +               irq += pin;
36212 +       }
36213 +       BUG_ON(irq >= NR_IRQS);
36214 +       return irq;
36215 +}
36216 +
36217 +static inline int IO_APIC_irq_trigger(int irq)
36218 +{
36219 +       int apic, idx, pin;
36220 +
36221 +       for (apic = 0; apic < nr_ioapics; apic++) {
36222 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36223 +                       idx = find_irq_entry(apic,pin,mp_INT);
36224 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
36225 +                               return irq_trigger(idx);
36226 +               }
36227 +       }
36228 +       /*
36229 +        * nonexistent IRQs are edge default
36230 +        */
36231 +       return 0;
36232 +}
36233 +
36234 +/* irq_vector is indexed by the sum of all RTEs in all I/O APICs. */
36235 +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
36236 +
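+/* The first 16 (legacy ISA) IRQs start out allowed on every CPU. */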
36237 +static cpumask_t irq_domain[NR_IRQ_VECTORS] __read_mostly = {
36238 +       [0] = CPU_MASK_ALL,
36239 +       [1] = CPU_MASK_ALL,
36240 +       [2] = CPU_MASK_ALL,
36241 +       [3] = CPU_MASK_ALL,
36242 +       [4] = CPU_MASK_ALL,
36243 +       [5] = CPU_MASK_ALL,
36244 +       [6] = CPU_MASK_ALL,
36245 +       [7] = CPU_MASK_ALL,
36246 +       [8] = CPU_MASK_ALL,
36247 +       [9] = CPU_MASK_ALL,
36248 +       [10] = CPU_MASK_ALL,
36249 +       [11] = CPU_MASK_ALL,
36250 +       [12] = CPU_MASK_ALL,
36251 +       [13] = CPU_MASK_ALL,
36252 +       [14] = CPU_MASK_ALL,
36253 +       [15] = CPU_MASK_ALL,
36254 +};
36255 +
36256 +
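+/*
+ * Under Xen the vector for a physical IRQ is allocated by the hypervisor
+ * (PHYSDEVOP_alloc_irq_vector); the result is cached in irq_vector[] so
+ * later calls only recompute the CPU mask.
+ */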
36257 +static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
36258 +{
36259 +       struct physdev_irq irq_op;
36260 +       int vector;
36261 +
36262 +       BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
36263 +
36264 +       if (irq_vector[irq] > 0) {
36265 +               cpus_and(*result, irq_domain[irq], mask);
36266 +               return irq_vector[irq];
36267 +       }
36268 +       irq_op.irq = irq;
36269 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
36270 +               return -ENOSPC;
36271 +
36272 +       vector = irq_op.vector;
36273 +       per_cpu(vector_irq,0)[vector] = irq;
36274 +       irq_vector[irq] = vector;
36275 +       cpus_and(*result, irq_domain[irq], mask);
36276 +
36277 +       return vector;
36278 +}
36279 +
36280 +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
36281 +{
36282 +       int vector;
36283 +       unsigned long flags;
36284 +
36285 +       spin_lock_irqsave(&vector_lock, flags);
36286 +       vector = __assign_irq_vector(irq, mask, result);
36287 +       spin_unlock_irqrestore(&vector_lock, flags);
36288 +
36289 +       return vector;
36290 +}
36291 +
36292 +void __setup_vector_irq(int cpu)
36293 +{
36294 +       /* Initialize vector_irq on a new cpu */
36295 +       /* This function must be called with vector_lock held */
36296 +       int irq, vector;
36297 +
36298 +       /* Mark the inuse vectors */
36299 +       for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
36300 +               if (!cpu_isset(cpu, irq_domain[irq]))
36301 +                       continue;
36302 +               vector = irq_vector[irq];
36303 +               per_cpu(vector_irq, cpu)[vector] = irq;
36304 +       }
36305 +       /* Mark the free vectors */
36306 +       for (vector = 0; vector < NR_VECTORS; ++vector) {
36307 +               irq = per_cpu(vector_irq, cpu)[vector];
36308 +               if (irq < 0)
36309 +                       continue;
36310 +               if (!cpu_isset(cpu, irq_domain[irq]))
36311 +                       per_cpu(vector_irq, cpu)[vector] = -1;
36312 +       }
36313 +}
36314 +
36315 +
36316 +extern void (*interrupt[NR_IRQS])(void);
36317 +
36318 +#ifndef CONFIG_XEN
36319 +static struct irq_chip ioapic_chip;
36320 +
36321 +#define IOAPIC_AUTO    -1
36322 +#define IOAPIC_EDGE    0
36323 +#define IOAPIC_LEVEL   1
36324 +
36325 +static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
36326 +{
36327 +       if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
36328 +                       trigger == IOAPIC_LEVEL)
36329 +               set_irq_chip_and_handler_name(irq, &ioapic_chip,
36330 +                                             handle_fasteoi_irq, "fasteoi");
36331 +       else {
36332 +               irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
36333 +               set_irq_chip_and_handler_name(irq, &ioapic_chip,
36334 +                                             handle_edge_irq, "edge");
36335 +       }
36336 +}
36337 +#else
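+/* Xen: interrupt delivery goes through event channels, so there is no
+ * native IO-APIC irq_chip to register for the irq. */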
36338 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
36339 +#endif /* !CONFIG_XEN */
36340 +
36341 +static void __init setup_IO_APIC_irqs(void)
36342 +{
36343 +       struct IO_APIC_route_entry entry;
36344 +       int apic, pin, idx, irq, first_notcon = 1, vector;
36345 +       unsigned long flags;
36346 +
36347 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
36348 +
36349 +       for (apic = 0; apic < nr_ioapics; apic++) {
36350 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36351 +
36352 +               /*
36353 +                * add it to the IO-APIC irq-routing table:
36354 +                */
36355 +               memset(&entry,0,sizeof(entry));
36356 +
36357 +               entry.delivery_mode = INT_DELIVERY_MODE;
36358 +               entry.dest_mode = INT_DEST_MODE;
36359 +               entry.mask = 0;                         /* enable IRQ */
36360 +               entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36361 +
36362 +               idx = find_irq_entry(apic,pin,mp_INT);
36363 +               if (idx == -1) {
36364 +                       if (first_notcon) {
36365 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
36366 +                               first_notcon = 0;
36367 +                       } else
36368 +                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
36369 +                       continue;
36370 +               }
36371 +
36372 +               entry.trigger = irq_trigger(idx);
36373 +               entry.polarity = irq_polarity(idx);
36374 +
36375 +               if (irq_trigger(idx)) {
36376 +                       entry.trigger = 1;
36377 +                       entry.mask = 1;
36378 +                       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36379 +               }
36380 +
36381 +               irq = pin_2_irq(idx, apic, pin);
36382 +               add_pin_to_irq(irq, apic, pin);
36383 +
36384 +               if (/* !apic && */ !IO_APIC_IRQ(irq))
36385 +                       continue;
36386 +
36387 +               if (IO_APIC_IRQ(irq)) {
36388 +                       cpumask_t mask;
36389 +                       vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
36390 +                       if (vector < 0)
36391 +                               continue;
36392 +
36393 +                       entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
36394 +                       entry.vector = vector;
36395 +
36396 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
36397 +                       if (!apic && (irq < 16))
36398 +                               disable_8259A_irq(irq);
36399 +               }
36400 +               ioapic_write_entry(apic, pin, entry);
36401 +
36402 +               spin_lock_irqsave(&ioapic_lock, flags);
36403 +               set_native_irq_info(irq, TARGET_CPUS);
36404 +               spin_unlock_irqrestore(&ioapic_lock, flags);
36405 +       }
36406 +       }
36407 +
36408 +       if (!first_notcon)
36409 +               apic_printk(APIC_VERBOSE," not connected.\n");
36410 +}
36411 +
36412 +#ifndef CONFIG_XEN
36413 +/*
36414 + * Set up the 8259A-master output pin as broadcast to all
36415 + * CPUs.
36416 + */
36417 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
36418 +{
36419 +       struct IO_APIC_route_entry entry;
36420 +       unsigned long flags;
36421 +
36422 +       memset(&entry,0,sizeof(entry));
36423 +
36424 +       disable_8259A_irq(0);
36425 +
36426 +       /* mask LVT0 */
36427 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
36428 +
36429 +       /*
36430 +        * We use logical delivery to get the timer IRQ
36431 +        * to the first CPU.
36432 +        */
36433 +       entry.dest_mode = INT_DEST_MODE;
36434 +       entry.mask = 0;                                 /* unmask IRQ now */
36435 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36436 +       entry.delivery_mode = INT_DELIVERY_MODE;
36437 +       entry.polarity = 0;
36438 +       entry.trigger = 0;
36439 +       entry.vector = vector;
36440 +
36441 +       /*
36442 +        * The timer IRQ doesn't have to know that behind the
36443 +        * scenes we have an 8259A-master in AEOI mode ...
36444 +        */
36445 +       set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
36446 +
36447 +       /*
36448 +        * Add it to the IO-APIC irq-routing table:
36449 +        */
36450 +       spin_lock_irqsave(&ioapic_lock, flags);
36451 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
36452 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
36453 +       spin_unlock_irqrestore(&ioapic_lock, flags);
36454 +
36455 +       enable_8259A_irq(0);
36456 +}
36457 +
36458 +void __init UNEXPECTED_IO_APIC(void)
36459 +{
36460 +}
36461 +
36462 +void __apicdebuginit print_IO_APIC(void)
36463 +{
36464 +       int apic, i;
36465 +       union IO_APIC_reg_00 reg_00;
36466 +       union IO_APIC_reg_01 reg_01;
36467 +       union IO_APIC_reg_02 reg_02;
36468 +       unsigned long flags;
36469 +
36470 +       if (apic_verbosity == APIC_QUIET)
36471 +               return;
36472 +
36473 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
36474 +       for (i = 0; i < nr_ioapics; i++)
36475 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
36476 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
36477 +
36478 +       /*
36479 +        * We are a bit conservative about what we expect.  We have to
36480 +        * know about every hardware change ASAP.
36481 +        */
36482 +       printk(KERN_INFO "testing the IO APIC.......................\n");
36483 +
36484 +       for (apic = 0; apic < nr_ioapics; apic++) {
36485 +
36486 +       spin_lock_irqsave(&ioapic_lock, flags);
36487 +       reg_00.raw = io_apic_read(apic, 0);
36488 +       reg_01.raw = io_apic_read(apic, 1);
36489 +       if (reg_01.bits.version >= 0x10)
36490 +               reg_02.raw = io_apic_read(apic, 2);
36491 +       spin_unlock_irqrestore(&ioapic_lock, flags);
36492 +
36493 +       printk("\n");
36494 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
36495 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
36496 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
36497 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
36498 +               UNEXPECTED_IO_APIC();
36499 +
36500 +       printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
36501 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
36502 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
36503 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
36504 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
36505 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
36506 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
36507 +               (reg_01.bits.entries != 0x2E) &&
36508 +               (reg_01.bits.entries != 0x3F) &&
36509 +               (reg_01.bits.entries != 0x03) 
36510 +       )
36511 +               UNEXPECTED_IO_APIC();
36512 +
36513 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
36514 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
36515 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
36516 +               (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
36517 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
36518 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
36519 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
36520 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
36521 +       )
36522 +               UNEXPECTED_IO_APIC();
36523 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
36524 +               UNEXPECTED_IO_APIC();
36525 +
36526 +       if (reg_01.bits.version >= 0x10) {
36527 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
36528 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
36529 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
36530 +                       UNEXPECTED_IO_APIC();
36531 +       }
36532 +
36533 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
36534 +
36535 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
36536 +                         " Stat Dest Deli Vect:   \n");
36537 +
36538 +       for (i = 0; i <= reg_01.bits.entries; i++) {
36539 +               struct IO_APIC_route_entry entry;
36540 +
36541 +               entry = ioapic_read_entry(apic, i);
36542 +
36543 +               printk(KERN_DEBUG " %02x %03X %02X  ",
36544 +                       i,
36545 +                       entry.dest.logical.logical_dest,
36546 +                       entry.dest.physical.physical_dest
36547 +               );
36548 +
36549 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
36550 +                       entry.mask,
36551 +                       entry.trigger,
36552 +                       entry.irr,
36553 +                       entry.polarity,
36554 +                       entry.delivery_status,
36555 +                       entry.dest_mode,
36556 +                       entry.delivery_mode,
36557 +                       entry.vector
36558 +               );
36559 +       }
36560 +       }
36561 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
36562 +       for (i = 0; i < NR_IRQS; i++) {
36563 +               struct irq_pin_list *entry = irq_2_pin + i;
36564 +               if (entry->pin < 0)
36565 +                       continue;
36566 +               printk(KERN_DEBUG "IRQ%d ", i);
36567 +               for (;;) {
36568 +                       printk("-> %d:%d", entry->apic, entry->pin);
36569 +                       if (!entry->next)
36570 +                               break;
36571 +                       entry = irq_2_pin + entry->next;
36572 +               }
36573 +               printk("\n");
36574 +       }
36575 +
36576 +       printk(KERN_INFO ".................................... done.\n");
36577 +
36578 +       return;
36579 +}
36580 +
36581 +#if 0
36582 +
36583 +static __apicdebuginit void print_APIC_bitfield (int base)
36584 +{
36585 +       unsigned int v;
36586 +       int i, j;
36587 +
36588 +       if (apic_verbosity == APIC_QUIET)
36589 +               return;
36590 +
36591 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
36592 +       for (i = 0; i < 8; i++) {
36593 +               v = apic_read(base + i*0x10);
36594 +               for (j = 0; j < 32; j++) {
36595 +                       if (v & (1<<j))
36596 +                               printk("1");
36597 +                       else
36598 +                               printk("0");
36599 +               }
36600 +               printk("\n");
36601 +       }
36602 +}
36603 +
36604 +void __apicdebuginit print_local_APIC(void * dummy)
36605 +{
36606 +       unsigned int v, ver, maxlvt;
36607 +
36608 +       if (apic_verbosity == APIC_QUIET)
36609 +               return;
36610 +
36611 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
36612 +               smp_processor_id(), hard_smp_processor_id());
36613 +       v = apic_read(APIC_ID);
36614 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
36615 +       v = apic_read(APIC_LVR);
36616 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
36617 +       ver = GET_APIC_VERSION(v);
36618 +       maxlvt = get_maxlvt();
36619 +
36620 +       v = apic_read(APIC_TASKPRI);
36621 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
36622 +
36623 +       v = apic_read(APIC_ARBPRI);
36624 +       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
36625 +               v & APIC_ARBPRI_MASK);
36626 +       v = apic_read(APIC_PROCPRI);
36627 +       printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
36628 +
36629 +       v = apic_read(APIC_EOI);
36630 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
36631 +       v = apic_read(APIC_RRR);
36632 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
36633 +       v = apic_read(APIC_LDR);
36634 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
36635 +       v = apic_read(APIC_DFR);
36636 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
36637 +       v = apic_read(APIC_SPIV);
36638 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
36639 +
36640 +       printk(KERN_DEBUG "... APIC ISR field:\n");
36641 +       print_APIC_bitfield(APIC_ISR);
36642 +       printk(KERN_DEBUG "... APIC TMR field:\n");
36643 +       print_APIC_bitfield(APIC_TMR);
36644 +       printk(KERN_DEBUG "... APIC IRR field:\n");
36645 +       print_APIC_bitfield(APIC_IRR);
36646 +
36647 +       v = apic_read(APIC_ESR);
36648 +       printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
36649 +
36650 +       v = apic_read(APIC_ICR);
36651 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
36652 +       v = apic_read(APIC_ICR2);
36653 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
36654 +
36655 +       v = apic_read(APIC_LVTT);
36656 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
36657 +
36658 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
36659 +               v = apic_read(APIC_LVTPC);
36660 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
36661 +       }
36662 +       v = apic_read(APIC_LVT0);
36663 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
36664 +       v = apic_read(APIC_LVT1);
36665 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
36666 +
36667 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
36668 +               v = apic_read(APIC_LVTERR);
36669 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
36670 +       }
36671 +
36672 +       v = apic_read(APIC_TMICT);
36673 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
36674 +       v = apic_read(APIC_TMCCT);
36675 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
36676 +       v = apic_read(APIC_TDCR);
36677 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
36678 +       printk("\n");
36679 +}
36680 +
36681 +void print_all_local_APICs (void)
36682 +{
36683 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
36684 +}
36685 +
36686 +void __apicdebuginit print_PIC(void)
36687 +{
36688 +       unsigned int v;
36689 +       unsigned long flags;
36690 +
36691 +       if (apic_verbosity == APIC_QUIET)
36692 +               return;
36693 +
36694 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
36695 +
36696 +       spin_lock_irqsave(&i8259A_lock, flags);
36697 +
36698 +       v = inb(0xa1) << 8 | inb(0x21);
36699 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
36700 +
36701 +       v = inb(0xa0) << 8 | inb(0x20);
36702 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
36703 +
36704 +       outb(0x0b,0xa0);
36705 +       outb(0x0b,0x20);
36706 +       v = inb(0xa0) << 8 | inb(0x20);
36707 +       outb(0x0a,0xa0);
36708 +       outb(0x0a,0x20);
36709 +
36710 +       spin_unlock_irqrestore(&i8259A_lock, flags);
36711 +
36712 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
36713 +
36714 +       v = inb(0x4d1) << 8 | inb(0x4d0);
36715 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
36716 +}
36717 +
36718 +#endif  /*  0  */
36719 +
36720 +#else
36721 +void __init print_IO_APIC(void) { }
36722 +#endif /* !CONFIG_XEN */
36723 +
36724 +static void __init enable_IO_APIC(void)
36725 +{
36726 +       union IO_APIC_reg_01 reg_01;
36727 +#ifndef CONFIG_XEN
36728 +       int i8259_apic, i8259_pin;
36729 +#endif
36730 +       int i, apic;
36731 +       unsigned long flags;
36732 +
36733 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
36734 +               irq_2_pin[i].pin = -1;
36735 +               irq_2_pin[i].next = 0;
36736 +       }
36737 +
36738 +       /*
36739 +        * The number of IO-APIC IRQ registers (== #pins):
36740 +        */
36741 +       for (apic = 0; apic < nr_ioapics; apic++) {
36742 +               spin_lock_irqsave(&ioapic_lock, flags);
36743 +               reg_01.raw = io_apic_read(apic, 1);
36744 +               spin_unlock_irqrestore(&ioapic_lock, flags);
36745 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
36746 +       }
36747 +#ifndef CONFIG_XEN
36748 +       for(apic = 0; apic < nr_ioapics; apic++) {
36749 +               int pin;
36750 +               /* See if any of the pins is in ExtINT mode */
36751 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36752 +                       struct IO_APIC_route_entry entry;
36753 +                       entry = ioapic_read_entry(apic, pin);
36754 +
36755 +                       /* If the interrupt line is enabled and in ExtInt mode
36756 +                        * I have found the pin where the i8259 is connected.
36757 +                        */
36758 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
36759 +                               ioapic_i8259.apic = apic;
36760 +                               ioapic_i8259.pin  = pin;
36761 +                               goto found_i8259;
36762 +                       }
36763 +               }
36764 +       }
36765 + found_i8259:
36766 +       /* Look to see if the MP table has reported the ExtINT */
36767 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
36768 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
36769 +       /* Trust the MP table if nothing is set up in the hardware */
36770 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
36771 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
36772 +               ioapic_i8259.pin  = i8259_pin;
36773 +               ioapic_i8259.apic = i8259_apic;
36774 +       }
36775 +       /* Complain if the MP table and the hardware disagree */
36776 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
36777 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
36778 +       {
36779 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
36780 +       }
36781 +#endif
36782 +
36783 +       /*
36784 +        * Do not trust the IO-APIC being empty at bootup
36785 +        */
36786 +       clear_IO_APIC();
36787 +}
36788 +
36789 +/*
36790 + * Not an __init, needed by the reboot code
36791 + */
36792 +void disable_IO_APIC(void)
36793 +{
36794 +       /*
36795 +        * Clear the IO-APIC before rebooting:
36796 +        */
36797 +       clear_IO_APIC();
36798 +
36799 +#ifndef CONFIG_XEN
36800 +       /*
36801 +        * If the i8259 is routed through an IOAPIC
36802 +        * Put that IOAPIC in virtual wire mode
36803 +        * so legacy interrupts can be delivered.
36804 +        */
36805 +       if (ioapic_i8259.pin != -1) {
36806 +               struct IO_APIC_route_entry entry;
36807 +
36808 +               memset(&entry, 0, sizeof(entry));
36809 +               entry.mask            = 0; /* Enabled */
36810 +               entry.trigger         = 0; /* Edge */
36811 +               entry.irr             = 0;
36812 +               entry.polarity        = 0; /* High */
36813 +               entry.delivery_status = 0;
36814 +               entry.dest_mode       = 0; /* Physical */
36815 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
36816 +               entry.vector          = 0;
36817 +               entry.dest.physical.physical_dest =
36818 +                                       GET_APIC_ID(apic_read(APIC_ID));
36819 +
36820 +               /*
36821 +                * Add it to the IO-APIC irq-routing table:
36822 +                */
36823 +               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
36824 +       }
36825 +
36826 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
36827 +#endif
36828 +}
36829 +
36830 +/*
36831 + * There is a nasty bug in some older SMP boards: their mptable lies
36832 + * about the timer IRQ. We do the following to work around the situation:
36833 + *
36834 + *     - timer IRQ defaults to IO-APIC IRQ
36835 + *     - if this function detects that timer IRQs are defunct, then we fall
36836 + *       back to ISA timer IRQs
36837 + */
36838 +#ifndef CONFIG_XEN
36839 +static int __init timer_irq_works(void)
36840 +{
36841 +       unsigned long t1 = jiffies;
36842 +
36843 +       local_irq_enable();
36844 +       /* Let ten ticks pass... */
36845 +       mdelay((10 * 1000) / HZ);
36846 +
36847 +       /*
36848 +        * Expect a few ticks at least, to be sure some possible
36849 +        * glue logic does not lock up after the first one or
36850 +        * two ticks in a non-ExtINT mode.  Also the local APIC
36851 +        * might have cached one ExtINT interrupt.  Finally, at
36852 +        * least one tick may be lost due to delays.
36853 +        */
36854 +
36855 +       /* jiffies wrap? */
36856 +       if (jiffies - t1 > 4)
36857 +               return 1;
36858 +       return 0;
36859 +}
36860 +
36861 +/*
36862 + * In the SMP+IOAPIC case it might happen that there are an unspecified
36863 + * number of pending IRQ events unhandled. These cases are very rare,
36864 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
36865 + * better to do it this way as thus we do not have to be aware of
36866 + * 'pending' interrupts in the IRQ path, except at this point.
36867 + */
36868 +/*
36869 + * Edge triggered needs to resend any interrupt
36870 + * that was delayed but this is now handled in the device
36871 + * independent code.
36872 + */
36873 +
36874 +/*
36875 + * Starting up an edge-triggered IO-APIC interrupt is
36876 + * nasty - we need to make sure that we get the edge.
36877 + * If it is already asserted for some reason, we need
36878 + * to return 1 to indicate that it was pending.
36879 + *
36880 + * This is not complete - we should be able to fake
36881 + * an edge even if it isn't on the 8259A...
36882 + */
36883 +
36884 +static unsigned int startup_ioapic_irq(unsigned int irq)
36885 +{
36886 +       int was_pending = 0;
36887 +       unsigned long flags;
36888 +
36889 +       spin_lock_irqsave(&ioapic_lock, flags);
36890 +       if (irq < 16) {
36891 +               disable_8259A_irq(irq);
36892 +               if (i8259A_irq_pending(irq))
36893 +                       was_pending = 1;
36894 +       }
36895 +       __unmask_IO_APIC_irq(irq);
36896 +       spin_unlock_irqrestore(&ioapic_lock, flags);
36897 +
36898 +       return was_pending;
36899 +}
36900 +
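+/* Retrigger by sending the irq's vector as an IPI to the first CPU in its domain. */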
36901 +static int ioapic_retrigger_irq(unsigned int irq)
36902 +{
36903 +       cpumask_t mask;
36904 +       unsigned vector;
36905 +       unsigned long flags;
36906 +
36907 +       spin_lock_irqsave(&vector_lock, flags);
36908 +       vector = irq_vector[irq];
36909 +       cpus_clear(mask);
36910 +       cpu_set(first_cpu(irq_domain[irq]), mask);
36911 +
36912 +       send_IPI_mask(mask, vector);
36913 +       spin_unlock_irqrestore(&vector_lock, flags);
36914 +
36915 +       return 1;
36916 +}
36917 +
36918 +/*
36919 + * Level and edge triggered IO-APIC interrupts need different handling,
36920 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
36921 + * handled with the level-triggered descriptor, but that one has slightly
36922 + * more overhead. Level-triggered interrupts cannot be handled with the
36923 + * edge-triggered handler, without risking IRQ storms and other ugly
36924 + * races.
36925 + */
36926 +
36927 +static void ack_apic_edge(unsigned int irq)
36928 +{
36929 +       move_native_irq(irq);
36930 +       ack_APIC_irq();
36931 +}
36932 +
36933 +static void ack_apic_level(unsigned int irq)
36934 +{
36935 +       int do_unmask_irq = 0;
36936 +
36937 +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
36938 +       /* If we are moving the irq we need to mask it */
36939 +       if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
36940 +               do_unmask_irq = 1;
36941 +               mask_IO_APIC_irq(irq);
36942 +       }
36943 +#endif
36944 +
36945 +       /*
36946 +        * We must acknowledge the irq before we move it or the acknowledge will
36947 +        * not propagate properly.
36948 +        */
36949 +       ack_APIC_irq();
36950 +
36951 +       /* Now we can move and re-enable the irq */
36952 +       move_masked_irq(irq);
36953 +       if (unlikely(do_unmask_irq))
36954 +               unmask_IO_APIC_irq(irq);
36955 +}
36956 +
36957 +static struct irq_chip ioapic_chip __read_mostly = {
36958 +       .name           = "IO-APIC",
36959 +       .startup        = startup_ioapic_irq,
36960 +       .mask           = mask_IO_APIC_irq,
36961 +       .unmask         = unmask_IO_APIC_irq,
36962 +       .ack            = ack_apic_edge,
36963 +       .eoi            = ack_apic_level,
36964 +#ifdef CONFIG_SMP
36965 +       .set_affinity   = set_ioapic_affinity_irq,
36966 +#endif
36967 +       .retrigger      = ioapic_retrigger_irq,
36968 +};
36969 +#endif /* !CONFIG_XEN */
36970 +
36971 +static inline void init_IO_APIC_traps(void)
36972 +{
36973 +       int irq;
36974 +
36975 +       /*
36976 +        * NOTE! The local APIC isn't very good at handling
36977 +        * multiple interrupts at the same interrupt level.
36978 +        * As the interrupt level is determined by taking the
36979 +        * vector number and shifting that right by 4, we
36980 +        * want to spread these out a bit so that they don't
36981 +        * all fall in the same interrupt level.
36982 +        *
36983 +        * Also, we've got to be careful not to trash gate
36984 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
36985 +        */
36986 +       for (irq = 0; irq < NR_IRQS ; irq++) {
36987 +               int tmp = irq;
36988 +               if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
36989 +                       /*
36990 +                        * Hmm.. We don't have an entry for this,
36991 +                        * so default to an old-fashioned 8259
36992 +                        * interrupt if we can..
36993 +                        */
36994 +                       if (irq < 16)
36995 +                               make_8259A_irq(irq);
36996 +#ifndef CONFIG_XEN
36997 +                       else
36998 +                               /* Strange. Oh, well.. */
36999 +                               irq_desc[irq].chip = &no_irq_chip;
37000 +#endif
37001 +               }
37002 +       }
37003 +}
37004 +
37005 +#ifndef CONFIG_XEN
37006 +static void enable_lapic_irq (unsigned int irq)
37007 +{
37008 +       unsigned long v;
37009 +
37010 +       v = apic_read(APIC_LVT0);
37011 +       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
37012 +}
37013 +
37014 +static void disable_lapic_irq (unsigned int irq)
37015 +{
37016 +       unsigned long v;
37017 +
37018 +       v = apic_read(APIC_LVT0);
37019 +       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
37020 +}
37021 +
37022 +static void ack_lapic_irq (unsigned int irq)
37023 +{
37024 +       ack_APIC_irq();
37025 +}
37026 +
37027 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
37028 +
37029 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
37030 +       .typename = "local-APIC-edge",
37031 +       .startup = NULL, /* startup_irq() not used for IRQ0 */
37032 +       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
37033 +       .enable = enable_lapic_irq,
37034 +       .disable = disable_lapic_irq,
37035 +       .ack = ack_lapic_irq,
37036 +       .end = end_lapic_irq,
37037 +};
37038 +
37039 +static void setup_nmi (void)
37040 +{
37041 +       /*
37042 +        * Dirty trick to enable the NMI watchdog ...
37043 +        * We put the 8259A master into AEOI mode and
37044 +        * unmask on all local APICs LVT0 as NMI.
37045 +        *
37046 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
37047 +        * is from Maciej W. Rozycki - so we do not have to EOI from
37048 +        * the NMI handler or the timer interrupt.
37049 +        */ 
37050 +       printk(KERN_INFO "activating NMI Watchdog ...");
37051 +
37052 +       enable_NMI_through_LVT0(NULL);
37053 +
37054 +       printk(" done.\n");
37055 +}
37056 +
37057 +/*
37058 + * This looks a bit hackish but it's about the only way of sending
37059 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
37060 + * not support the ExtINT mode, unfortunately.  We need to send these
37061 + * cycles as some i82489DX-based boards have glue logic that keeps the
37062 + * 8259A interrupt line asserted until INTA.  --macro
37063 + */
37064 +static inline void unlock_ExtINT_logic(void)
37065 +{
37066 +       int apic, pin, i;
37067 +       struct IO_APIC_route_entry entry0, entry1;
37068 +       unsigned char save_control, save_freq_select;
37069 +       unsigned long flags;
37070 +
37071 +       pin  = find_isa_irq_pin(8, mp_INT);
37072 +       apic = find_isa_irq_apic(8, mp_INT);
37073 +       if (pin == -1)
37074 +               return;
37075 +
37076 +       spin_lock_irqsave(&ioapic_lock, flags);
37077 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
37078 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
37079 +       spin_unlock_irqrestore(&ioapic_lock, flags);
37080 +       clear_IO_APIC_pin(apic, pin);
37081 +
37082 +       memset(&entry1, 0, sizeof(entry1));
37083 +
37084 +       entry1.dest_mode = 0;                   /* physical delivery */
37085 +       entry1.mask = 0;                        /* unmask IRQ now */
37086 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
37087 +       entry1.delivery_mode = dest_ExtINT;
37088 +       entry1.polarity = entry0.polarity;
37089 +       entry1.trigger = 0;
37090 +       entry1.vector = 0;
37091 +
37092 +       spin_lock_irqsave(&ioapic_lock, flags);
37093 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
37094 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
37095 +       spin_unlock_irqrestore(&ioapic_lock, flags);
37096 +
37097 +       save_control = CMOS_READ(RTC_CONTROL);
37098 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
37099 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
37100 +                  RTC_FREQ_SELECT);
37101 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
37102 +
37103 +       i = 100;
37104 +       while (i-- > 0) {
37105 +               mdelay(10);
37106 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
37107 +                       i -= 10;
37108 +       }
37109 +
37110 +       CMOS_WRITE(save_control, RTC_CONTROL);
37111 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
37112 +       clear_IO_APIC_pin(apic, pin);
37113 +
37114 +       spin_lock_irqsave(&ioapic_lock, flags);
37115 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
37116 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
37117 +       spin_unlock_irqrestore(&ioapic_lock, flags);
37118 +}
37119 +
37120 +/*
37121 + * This code may look a bit paranoid, but it's supposed to cooperate with
37122 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
37123 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
37124 + * fanatically on his truly buggy board.
37125 + *
37126 + * FIXME: really need to revamp this for modern platforms only.
37127 + */
37128 +static inline void check_timer(void)
37129 +{
37130 +       int apic1, pin1, apic2, pin2;
37131 +       int vector;
37132 +       cpumask_t mask;
37133 +
37134 +       /*
37135 +        * get/set the timer IRQ vector:
37136 +        */
37137 +       disable_8259A_irq(0);
37138 +       vector = assign_irq_vector(0, TARGET_CPUS, &mask);
37139 +
37140 +       /*
37141 +        * Subtle, code in do_timer_interrupt() expects an AEOI
37142 +        * mode for the 8259A whenever interrupts are routed
37143 +        * through I/O APICs.  Also IRQ0 has to be enabled in
37144 +        * the 8259A which implies the virtual wire has to be
37145 +        * disabled in the local APIC.
37146 +        */
37147 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
37148 +       init_8259A(1);
37149 +       if (timer_over_8254 > 0)
37150 +               enable_8259A_irq(0);
37151 +
37152 +       pin1  = find_isa_irq_pin(0, mp_INT);
37153 +       apic1 = find_isa_irq_apic(0, mp_INT);
37154 +       pin2  = ioapic_i8259.pin;
37155 +       apic2 = ioapic_i8259.apic;
37156 +
37157 +       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
37158 +               vector, apic1, pin1, apic2, pin2);
37159 +
37160 +       if (pin1 != -1) {
37161 +               /*
37162 +                * Ok, does IRQ0 through the IOAPIC work?
37163 +                */
37164 +               unmask_IO_APIC_irq(0);
37165 +               if (!no_timer_check && timer_irq_works()) {
37166 +                       nmi_watchdog_default();
37167 +                       if (nmi_watchdog == NMI_IO_APIC) {
37168 +                               disable_8259A_irq(0);
37169 +                               setup_nmi();
37170 +                               enable_8259A_irq(0);
37171 +                       }
37172 +                       if (disable_timer_pin_1 > 0)
37173 +                               clear_IO_APIC_pin(0, pin1);
37174 +                       return;
37175 +               }
37176 +               clear_IO_APIC_pin(apic1, pin1);
37177 +               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
37178 +                               "connected to IO-APIC\n");
37179 +       }
37180 +
37181 +       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
37182 +                               "through the 8259A ... ");
37183 +       if (pin2 != -1) {
37184 +               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
37185 +                       apic2, pin2);
37186 +               /*
37187 +                * legacy devices should be connected to IO APIC #0
37188 +                */
37189 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
37190 +               if (timer_irq_works()) {
37191 +                       apic_printk(APIC_VERBOSE," works.\n");
37192 +                       nmi_watchdog_default();
37193 +                       if (nmi_watchdog == NMI_IO_APIC) {
37194 +                               setup_nmi();
37195 +                       }
37196 +                       return;
37197 +               }
37198 +               /*
37199 +                * Cleanup, just in case ...
37200 +                */
37201 +               clear_IO_APIC_pin(apic2, pin2);
37202 +       }
37203 +       apic_printk(APIC_VERBOSE," failed.\n");
37204 +
37205 +       if (nmi_watchdog == NMI_IO_APIC) {
37206 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
37207 +               nmi_watchdog = 0;
37208 +       }
37209 +
37210 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
37211 +
37212 +       disable_8259A_irq(0);
37213 +       irq_desc[0].chip = &lapic_irq_type;
37214 +       apic_write(APIC_LVT0, APIC_DM_FIXED | vector);  /* Fixed mode */
37215 +       enable_8259A_irq(0);
37216 +
37217 +       if (timer_irq_works()) {
37218 +               apic_printk(APIC_VERBOSE," works.\n");
37219 +               return;
37220 +       }
37221 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
37222 +       apic_printk(APIC_VERBOSE," failed.\n");
37223 +
37224 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
37225 +
37226 +       init_8259A(0);
37227 +       make_8259A_irq(0);
37228 +       apic_write(APIC_LVT0, APIC_DM_EXTINT);
37229 +
37230 +       unlock_ExtINT_logic();
37231 +
37232 +       if (timer_irq_works()) {
37233 +               apic_printk(APIC_VERBOSE," works.\n");
37234 +               return;
37235 +       }
37236 +       apic_printk(APIC_VERBOSE," failed :(.\n");
37237 +       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
37238 +}
37239 +#else
37240 +#define check_timer() ((void)0)
37241 +#endif /* !CONFIG_XEN */
37242 +
37243 +static int __init notimercheck(char *s)
37244 +{
37245 +       no_timer_check = 1;
37246 +       return 1;
37247 +}
37248 +__setup("no_timer_check", notimercheck);
37249 +
37250 +/*
37251 + *
37252 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
37253 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
37254 + *   Linux doesn't really care, as it's not actually used
37255 + *   for any interrupt handling anyway.
37256 + */
37257 +#define PIC_IRQS       (1<<2)
37258 +
37259 +void __init setup_IO_APIC(void)
37260 +{
37261 +       enable_IO_APIC();
37262 +
37263 +       if (acpi_ioapic)
37264 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
37265 +       else
37266 +               io_apic_irqs = ~PIC_IRQS;
37267 +
37268 +       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
37269 +
37270 +#ifndef CONFIG_XEN
37271 +       sync_Arb_IDs();
37272 +#endif /* !CONFIG_XEN */
37273 +       setup_IO_APIC_irqs();
37274 +       init_IO_APIC_traps();
37275 +       check_timer();
37276 +       if (!acpi_ioapic)
37277 +               print_IO_APIC();
37278 +}
37279 +
37280 +struct sysfs_ioapic_data {
37281 +       struct sys_device dev;
37282 +       struct IO_APIC_route_entry entry[0];
37283 +};
37284 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
37285 +
37286 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
37287 +{
37288 +       struct IO_APIC_route_entry *entry;
37289 +       struct sysfs_ioapic_data *data;
37290 +       int i;
37291 +
37292 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
37293 +       entry = data->entry;
37294 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
37295 +               *entry = ioapic_read_entry(dev->id, i);
37296 +
37297 +       return 0;
37298 +}
37299 +
37300 +static int ioapic_resume(struct sys_device *dev)
37301 +{
37302 +       struct IO_APIC_route_entry *entry;
37303 +       struct sysfs_ioapic_data *data;
37304 +       unsigned long flags;
37305 +       union IO_APIC_reg_00 reg_00;
37306 +       int i;
37307 +
37308 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
37309 +       entry = data->entry;
37310 +
37311 +       spin_lock_irqsave(&ioapic_lock, flags);
37312 +       reg_00.raw = io_apic_read(dev->id, 0);
37313 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
37314 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
37315 +               io_apic_write(dev->id, 0, reg_00.raw);
37316 +       }
37317 +       spin_unlock_irqrestore(&ioapic_lock, flags);
37318 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
37319 +               ioapic_write_entry(dev->id, i, entry[i]);
37320 +
37321 +       return 0;
37322 +}
37323 +
37324 +static struct sysdev_class ioapic_sysdev_class = {
37325 +       set_kset_name("ioapic"),
37326 +       .suspend = ioapic_suspend,
37327 +       .resume = ioapic_resume,
37328 +};
37329 +
37330 +static int __init ioapic_init_sysfs(void)
37331 +{
37332 +       struct sys_device * dev;
37333 +       int i, size, error = 0;
37334 +
37335 +       error = sysdev_class_register(&ioapic_sysdev_class);
37336 +       if (error)
37337 +               return error;
37338 +
37339 +       for (i = 0; i < nr_ioapics; i++ ) {
37340 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i]
37341 +                       * sizeof(struct IO_APIC_route_entry);
37342 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
37343 +               if (!mp_ioapic_data[i]) {
37344 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
37345 +                       continue;
37346 +               }
37347 +               memset(mp_ioapic_data[i], 0, size);
37348 +               dev = &mp_ioapic_data[i]->dev;
37349 +               dev->id = i;
37350 +               dev->cls = &ioapic_sysdev_class;
37351 +               error = sysdev_register(dev);
37352 +               if (error) {
37353 +                       kfree(mp_ioapic_data[i]);
37354 +                       mp_ioapic_data[i] = NULL;
37355 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
37356 +                       continue;
37357 +               }
37358 +       }
37359 +
37360 +       return 0;
37361 +}
37362 +
37363 +device_initcall(ioapic_init_sysfs);
37364 +
37365 +/*
37366 + * Dynamic irq allocation and deallocation
37367 + */
37368 +int create_irq(void)
37369 +{
37370 +       /* Allocate an unused irq */
37371 +       int irq;
37372 +       int new;
37373 +       int vector = 0;
37374 +       unsigned long flags;
37375 +       cpumask_t mask;
37376 +
37377 +       irq = -ENOSPC;
37378 +       spin_lock_irqsave(&vector_lock, flags);
37379 +       for (new = (NR_IRQS - 1); new >= 0; new--) {
37380 +               if (platform_legacy_irq(new))
37381 +                       continue;
37382 +               if (irq_vector[new] != 0)
37383 +                       continue;
37384 +               vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
37385 +               if (likely(vector > 0))
37386 +                       irq = new;
37387 +               break;
37388 +       }
37389 +       spin_unlock_irqrestore(&vector_lock, flags);
37390 +
37391 +       if (irq >= 0) {
37392 +               dynamic_irq_init(irq);
37393 +       }
37394 +       return irq;
37395 +}
37396 +
37397 +void destroy_irq(unsigned int irq)
37398 +{
37399 +       unsigned long flags;
37400 +
37401 +       dynamic_irq_cleanup(irq);
37402 +
37403 +       spin_lock_irqsave(&vector_lock, flags);
37404 +       irq_vector[irq] = 0;
37405 +       spin_unlock_irqrestore(&vector_lock, flags);
37406 +}
37407 +
37408 +/*
37409 + * MSI message composition
37410 + */
37411 +#ifndef CONFIG_XEN
37412 +#ifdef CONFIG_PCI_MSI
37413 +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
37414 +{
37415 +       int vector;
37416 +       unsigned dest;
37417 +       cpumask_t tmp;
37418 +
37419 +       vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
37420 +       if (vector >= 0) {
37421 +               dest = cpu_mask_to_apicid(tmp);
37422 +
37423 +               msg->address_hi = MSI_ADDR_BASE_HI;
37424 +               msg->address_lo =
37425 +                       MSI_ADDR_BASE_LO |
37426 +                       ((INT_DEST_MODE == 0) ?
37427 +                               MSI_ADDR_DEST_MODE_PHYSICAL:
37428 +                               MSI_ADDR_DEST_MODE_LOGICAL) |
37429 +                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
37430 +                               MSI_ADDR_REDIRECTION_CPU:
37431 +                               MSI_ADDR_REDIRECTION_LOWPRI) |
37432 +                       MSI_ADDR_DEST_ID(dest);
37433 +
37434 +               msg->data =
37435 +                       MSI_DATA_TRIGGER_EDGE |
37436 +                       MSI_DATA_LEVEL_ASSERT |
37437 +                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
37438 +                               MSI_DATA_DELIVERY_FIXED:
37439 +                               MSI_DATA_DELIVERY_LOWPRI) |
37440 +                       MSI_DATA_VECTOR(vector);
37441 +       }
37442 +       return vector;
37443 +}
37444 +
37445 +#ifdef CONFIG_SMP
37446 +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
37447 +{
37448 +       struct msi_msg msg;
37449 +       unsigned int dest;
37450 +       cpumask_t tmp;
37451 +       int vector;
37452 +
37453 +       cpus_and(tmp, mask, cpu_online_map);
37454 +       if (cpus_empty(tmp))
37455 +               tmp = TARGET_CPUS;
37456 +
37457 +       cpus_and(mask, tmp, CPU_MASK_ALL);
37458 +
37459 +       vector = assign_irq_vector(irq, mask, &tmp);
37460 +       if (vector < 0)
37461 +               return;
37462 +
37463 +       dest = cpu_mask_to_apicid(tmp);
37464 +
37465 +       read_msi_msg(irq, &msg);
37466 +
37467 +       msg.data &= ~MSI_DATA_VECTOR_MASK;
37468 +       msg.data |= MSI_DATA_VECTOR(vector);
37469 +       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
37470 +       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
37471 +
37472 +       write_msi_msg(irq, &msg);
37473 +       set_native_irq_info(irq, mask);
37474 +}
37475 +#endif /* CONFIG_SMP */
37476 +
37477 +/*
37478 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
37479 + * which implement the MSI or MSI-X Capability Structure.
37480 + */
37481 +static struct irq_chip msi_chip = {
37482 +       .name           = "PCI-MSI",
37483 +       .unmask         = unmask_msi_irq,
37484 +       .mask           = mask_msi_irq,
37485 +       .ack            = ack_apic_edge,
37486 +#ifdef CONFIG_SMP
37487 +       .set_affinity   = set_msi_irq_affinity,
37488 +#endif
37489 +       .retrigger      = ioapic_retrigger_irq,
37490 +};
37491 +
37492 +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
37493 +{
37494 +       struct msi_msg msg;
37495 +       int ret;
37496 +       ret = msi_compose_msg(dev, irq, &msg);
37497 +       if (ret < 0)
37498 +               return ret;
37499 +
37500 +       write_msi_msg(irq, &msg);
37501 +
37502 +       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
37503 +
37504 +       return 0;
37505 +}
37506 +
37507 +void arch_teardown_msi_irq(unsigned int irq)
37508 +{
37509 +       return;
37510 +}
37511 +
37512 +#endif /* CONFIG_PCI_MSI */
37513 +#endif /* !CONFIG_XEN */
37514 +/*
37515 + * Hypertransport interrupt support
37516 + */
37517 +#ifdef CONFIG_HT_IRQ
37518 +
37519 +#ifdef CONFIG_SMP
37520 +
37521 +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
37522 +{
37523 +       struct ht_irq_msg msg;
37524 +       fetch_ht_irq_msg(irq, &msg);
37525 +
37526 +       msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
37527 +       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
37528 +
37529 +       msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
37530 +       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
37531 +
37532 +       write_ht_irq_msg(irq, &msg);
37533 +}
37534 +
37535 +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
37536 +{
37537 +       unsigned int dest;
37538 +       cpumask_t tmp;
37539 +       int vector;
37540 +
37541 +       cpus_and(tmp, mask, cpu_online_map);
37542 +       if (cpus_empty(tmp))
37543 +               tmp = TARGET_CPUS;
37544 +
37545 +       cpus_and(mask, tmp, CPU_MASK_ALL);
37546 +
37547 +       vector = assign_irq_vector(irq, mask, &tmp);
37548 +       if (vector < 0)
37549 +               return;
37550 +
37551 +       dest = cpu_mask_to_apicid(tmp);
37552 +
37553 +       target_ht_irq(irq, dest, vector);
37554 +       set_native_irq_info(irq, mask);
37555 +}
37556 +#endif
37557 +
37558 +static struct irq_chip ht_irq_chip = {
37559 +       .name           = "PCI-HT",
37560 +       .mask           = mask_ht_irq,
37561 +       .unmask         = unmask_ht_irq,
37562 +       .ack            = ack_apic_edge,
37563 +#ifdef CONFIG_SMP
37564 +       .set_affinity   = set_ht_irq_affinity,
37565 +#endif
37566 +       .retrigger      = ioapic_retrigger_irq,
37567 +};
37568 +
37569 +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
37570 +{
37571 +       int vector;
37572 +       cpumask_t tmp;
37573 +
37574 +       vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
37575 +       if (vector >= 0) {
37576 +               struct ht_irq_msg msg;
37577 +               unsigned dest;
37578 +
37579 +               dest = cpu_mask_to_apicid(tmp);
37580 +
37581 +               msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
37582 +
37583 +               msg.address_lo =
37584 +                       HT_IRQ_LOW_BASE |
37585 +                       HT_IRQ_LOW_DEST_ID(dest) |
37586 +                       HT_IRQ_LOW_VECTOR(vector) |
37587 +                       ((INT_DEST_MODE == 0) ?
37588 +                               HT_IRQ_LOW_DM_PHYSICAL :
37589 +                               HT_IRQ_LOW_DM_LOGICAL) |
37590 +                       HT_IRQ_LOW_RQEOI_EDGE |
37591 +                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
37592 +                               HT_IRQ_LOW_MT_FIXED :
37593 +                               HT_IRQ_LOW_MT_ARBITRATED) |
37594 +                       HT_IRQ_LOW_IRQ_MASKED;
37595 +
37596 +               write_ht_irq_msg(irq, &msg);
37597 +
37598 +               set_irq_chip_and_handler_name(irq, &ht_irq_chip,
37599 +                                             handle_edge_irq, "edge");
37600 +       }
37601 +       return vector;
37602 +}
37603 +#endif /* CONFIG_HT_IRQ */
37604 +
37605 +/* --------------------------------------------------------------------------
37606 +                          ACPI-based IOAPIC Configuration
37607 +   -------------------------------------------------------------------------- */
37608 +
37609 +#ifdef CONFIG_ACPI
37610 +
37611 +#define IO_APIC_MAX_ID         0xFE
37612 +
37613 +int __init io_apic_get_redir_entries (int ioapic)
37614 +{
37615 +       union IO_APIC_reg_01    reg_01;
37616 +       unsigned long flags;
37617 +
37618 +       spin_lock_irqsave(&ioapic_lock, flags);
37619 +       reg_01.raw = io_apic_read(ioapic, 1);
37620 +       spin_unlock_irqrestore(&ioapic_lock, flags);
37621 +
37622 +       return reg_01.bits.entries;
37623 +}
37624 +
37625 +
37626 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
37627 +{
37628 +       struct IO_APIC_route_entry entry;
37629 +       unsigned long flags;
37630 +       int vector;
37631 +       cpumask_t mask;
37632 +
37633 +       if (!IO_APIC_IRQ(irq)) {
37634 +               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
37635 +                       ioapic);
37636 +               return -EINVAL;
37637 +       }
37638 +
37639 +       /*
37640 +        * IRQs < 16 are already in the irq_2_pin[] map
37641 +        */
37642 +       if (irq >= 16)
37643 +               add_pin_to_irq(irq, ioapic, pin);
37644 +
37645 +
37646 +       vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
37647 +       if (vector < 0)
37648 +               return vector;
37649 +
37650 +       /*
37651 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
37652 +        * Note that we mask (disable) IRQs now -- these get enabled when the
37653 +        * corresponding device driver registers for this IRQ.
37654 +        */
37655 +
37656 +       memset(&entry,0,sizeof(entry));
37657 +
37658 +       entry.delivery_mode = INT_DELIVERY_MODE;
37659 +       entry.dest_mode = INT_DEST_MODE;
37660 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
37661 +       entry.trigger = triggering;
37662 +       entry.polarity = polarity;
37663 +       entry.mask = 1;                                  /* Disabled (masked) */
37664 +       entry.vector = vector & 0xff;
37665 +
37666 +       apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
37667 +               "IRQ %d Mode:%i Active:%i)\n", ioapic, 
37668 +              mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
37669 +              triggering, polarity);
37670 +
37671 +       ioapic_register_intr(irq, entry.vector, triggering);
37672 +
37673 +       if (!ioapic && (irq < 16))
37674 +               disable_8259A_irq(irq);
37675 +
37676 +       ioapic_write_entry(ioapic, pin, entry);
37677 +
37678 +       spin_lock_irqsave(&ioapic_lock, flags);
37679 +       set_native_irq_info(irq, TARGET_CPUS);
37680 +       spin_unlock_irqrestore(&ioapic_lock, flags);
37681 +
37682 +       return 0;
37683 +}
37684 +
37685 +#endif /* CONFIG_ACPI */
37686 +
37687 +
37688 +#ifndef CONFIG_XEN
37689 +/*
37690 + * This function is currently only a helper for the i386 SMP boot process, where
37691 + * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
37692 + * so the mask in all cases should simply be TARGET_CPUS.
37693 + */
37694 +#ifdef CONFIG_SMP
37695 +void __init setup_ioapic_dest(void)
37696 +{
37697 +       int pin, ioapic, irq, irq_entry;
37698 +
37699 +       if (skip_ioapic_setup == 1)
37700 +               return;
37701 +
37702 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
37703 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
37704 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
37705 +                       if (irq_entry == -1)
37706 +                               continue;
37707 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
37708 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
37709 +               }
37710 +
37711 +       }
37712 +}
37713 +#endif
37714 +#endif /* !CONFIG_XEN */
37715 diff -ruNp linux-2.6.19/arch/x86_64/kernel/ioport-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/ioport-xen.c
37716 --- linux-2.6.19/arch/x86_64/kernel/ioport-xen.c        1970-01-01 00:00:00.000000000 +0000
37717 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/ioport-xen.c      2007-02-02 19:10:26.000000000 +0000
37718 @@ -0,0 +1,99 @@
37719 +/*
37720 + *     linux/arch/x86_64/kernel/ioport.c
37721 + *
37722 + * This contains the io-permission bitmap code - written by obz, with changes
37723 + * by Linus.
37724 + */
37725 +
37726 +#include <linux/sched.h>
37727 +#include <linux/kernel.h>
37728 +#include <linux/capability.h>
37729 +#include <linux/errno.h>
37730 +#include <linux/types.h>
37731 +#include <linux/ioport.h>
37732 +#include <linux/mm.h>
37733 +#include <linux/smp.h>
37734 +#include <linux/smp_lock.h>
37735 +#include <linux/stddef.h>
37736 +#include <linux/slab.h>
37737 +#include <linux/thread_info.h>
37738 +#include <xen/interface/physdev.h>
37739 +
37740 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
37741 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
37742 +{
37743 +       int i;
37744 +       if (new_value)
37745 +               for (i = base; i < base + extent; i++)
37746 +                       __set_bit(i, bitmap);
37747 +       else
37748 +               for (i = base; i < base + extent; i++)
37749 +                       clear_bit(i, bitmap);
37750 +}
37751 +
37752 +/*
37753 + * this changes the io permissions bitmap in the current task.
37754 + */
37755 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
37756 +{
37757 +       struct thread_struct * t = &current->thread;
37758 +       unsigned long *bitmap;
37759 +       struct physdev_set_iobitmap set_iobitmap;
37760 +
37761 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
37762 +               return -EINVAL;
37763 +       if (turn_on && !capable(CAP_SYS_RAWIO))
37764 +               return -EPERM;
37765 +
37766 +       /*
37767 +        * If it's the first ioperm() call in this thread's lifetime, set the
37768 +        * IO bitmap up. ioperm() is much less timing critical than clone(),
37769 +        * this is why we delay this operation until now:
37770 +        */
37771 +       if (!t->io_bitmap_ptr) {
37772 +               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
37773 +               if (!bitmap)
37774 +                       return -ENOMEM;
37775 +
37776 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
37777 +               t->io_bitmap_ptr = bitmap;
37778 +               set_thread_flag(TIF_IO_BITMAP);
37779 +
37780 +               set_iobitmap.bitmap   = (char *)bitmap;
37781 +               set_iobitmap.nr_ports = IO_BITMAP_BITS;
37782 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
37783 +       }
37784 +
37785 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
37786 +
37787 +       return 0;
37788 +}
37789 +
37790 +/*
37791 + * sys_iopl has to be used when you want to access the IO ports
37792 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
37793 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
37794 + *
37795 + */
37796 +
37797 +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
37798 +{
37799 +       unsigned int old = current->thread.iopl;
37800 +       struct physdev_set_iopl set_iopl;
37801 +
37802 +       if (level > 3)
37803 +               return -EINVAL;
37804 +       /* Trying to gain more privileges? */
37805 +       if (level > old) {
37806 +               if (!capable(CAP_SYS_RAWIO))
37807 +                       return -EPERM;
37808 +       }
37809 +       /* Change our version of the privilege levels. */
37810 +       current->thread.iopl = level;
37811 +
37812 +       /* Force the change at ring 0. */
37813 +       set_iopl.iopl = (level == 0) ? 1 : level;
37814 +       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
37815 +
37816 +       return 0;
37817 +}
37818 diff -ruNp linux-2.6.19/arch/x86_64/kernel/irq.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/irq.c
37819 --- linux-2.6.19/arch/x86_64/kernel/irq.c       2006-11-29 21:57:37.000000000 +0000
37820 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/irq.c     2007-02-02 19:10:26.000000000 +0000
37821 @@ -112,7 +112,11 @@ asmlinkage unsigned int do_IRQ(struct pt
37822  
37823         exit_idle();
37824         irq_enter();
37825 +#ifdef CONFIG_XEN
37826 +       irq = vector;
37827 +#else
37828         irq = __get_cpu_var(vector_irq)[vector];
37829 +#endif
37830  
37831  #ifdef CONFIG_DEBUG_STACKOVERFLOW
37832         stack_overflow_check(regs);
37833 diff -ruNp linux-2.6.19/arch/x86_64/kernel/irqflags-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/irqflags-xen.c
37834 --- linux-2.6.19/arch/x86_64/kernel/irqflags-xen.c      1970-01-01 00:00:00.000000000 +0000
37835 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/irqflags-xen.c    2007-02-02 19:10:26.000000000 +0000
37836 @@ -0,0 +1,100 @@
37837 +#include <linux/module.h>
37838 +#include <linux/smp.h>
37839 +#include <asm/irqflags.h>
37840 +#include <asm/hypervisor.h>
37841 +
37842 +/* 
37843 + * The use of 'barrier' in the following reflects their use as local-lock
37844 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
37845 + * critical operations are executed. All critical operations must complete
37846 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
37847 + * includes these barriers, for example.
37848 + */
37849 +
37850 +unsigned long __raw_local_save_flags(void)
37851 +{
37852 +       struct vcpu_info *_vcpu;
37853 +       unsigned long flags;
37854 +
37855 +       preempt_disable();
37856 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
37857 +       flags = _vcpu->evtchn_upcall_mask;
37858 +       preempt_enable();
37859 +
37860 +       return flags;
37861 +}
37862 +EXPORT_SYMBOL(__raw_local_save_flags);
37863 +
37864 +void raw_local_irq_restore(unsigned long flags)
37865 +{
37866 +       struct vcpu_info *_vcpu;
37867 +       preempt_disable();
37868 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
37869 +       if ((_vcpu->evtchn_upcall_mask = flags) == 0) {
37870 +               barrier(); /* unmask then check (avoid races) */
37871 +               if ( unlikely(_vcpu->evtchn_upcall_pending) )
37872 +                       force_evtchn_callback();
37873 +               preempt_enable();
37874 +       } else
37875 +               preempt_enable_no_resched();
37876 +}
37877 +EXPORT_SYMBOL(raw_local_irq_restore);
37878 +
37879 +void raw_local_irq_disable(void)
37880 +{
37881 +       struct vcpu_info *_vcpu;
37882 +
37883 +       preempt_disable();
37884 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
37885 +       _vcpu->evtchn_upcall_mask = 1;
37886 +       preempt_enable_no_resched();
37887 +}
37888 +EXPORT_SYMBOL(raw_local_irq_disable);
37889 +
37890 +void raw_local_irq_enable(void)
37891 +{
37892 +       struct vcpu_info *_vcpu;
37893 +
37894 +       preempt_disable();
37895 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
37896 +       _vcpu->evtchn_upcall_mask = 0;
37897 +       barrier(); /* unmask then check (avoid races) */
37898 +       if ( unlikely(_vcpu->evtchn_upcall_pending) )
37899 +               force_evtchn_callback();
37900 +       preempt_enable();
37901 +}
37902 +EXPORT_SYMBOL(raw_local_irq_enable);
37903 +
37904 +/*
37905 + * For spinlocks, etc.:
37906 + */
37907 +
37908 +unsigned long __raw_local_irq_save(void)
37909 +{
37910 +       struct vcpu_info *_vcpu;
37911 +       unsigned long flags;
37912 +
37913 +       preempt_disable();
37914 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
37915 +       flags = _vcpu->evtchn_upcall_mask;
37916 +       _vcpu->evtchn_upcall_mask = 1;
37917 +       preempt_enable_no_resched();
37918 +
37919 +       return flags;
37920 +}
37921 +EXPORT_SYMBOL(__raw_local_irq_save);
37922 +
37923 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
37924 +int raw_irqs_disabled(void)
37925 +{
37926 +       struct vcpu_info *_vcpu;
37927 +       int disabled;
37928 +
37929 +       preempt_disable();
37930 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];
37931 +       disabled = (_vcpu->evtchn_upcall_mask != 0);
37932 +       preempt_enable_no_resched();
37933 +
37934 +       return disabled;
37935 +}
37936 +EXPORT_SYMBOL(raw_irqs_disabled);
37937 diff -ruNp linux-2.6.19/arch/x86_64/kernel/ldt-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/ldt-xen.c
37938 --- linux-2.6.19/arch/x86_64/kernel/ldt-xen.c   1970-01-01 00:00:00.000000000 +0000
37939 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/ldt-xen.c 2007-02-02 19:10:26.000000000 +0000
37940 @@ -0,0 +1,283 @@
37941 +/*
37942 + * linux/arch/x86_64/kernel/ldt.c
37943 + *
37944 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
37945 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
37946 + * Copyright (C) 2002 Andi Kleen
37947 + * 
37948 + * This handles calls from both 32bit and 64bit mode.
37949 + */
37950 +
37951 +#include <linux/errno.h>
37952 +#include <linux/sched.h>
37953 +#include <linux/string.h>
37954 +#include <linux/mm.h>
37955 +#include <linux/smp.h>
37956 +#include <linux/smp_lock.h>
37957 +#include <linux/vmalloc.h>
37958 +#include <linux/slab.h>
37959 +
37960 +#include <asm/uaccess.h>
37961 +#include <asm/system.h>
37962 +#include <asm/ldt.h>
37963 +#include <asm/desc.h>
37964 +#include <asm/proto.h>
37965 +#include <asm/pgalloc.h>
37966 +
37967 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
37968 +static void flush_ldt(void *null)
37969 +{
37970 +       if (current->active_mm)
37971 +               load_LDT(&current->active_mm->context);
37972 +}
37973 +#endif
37974 +
37975 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
37976 +{
37977 +       void *oldldt;
37978 +       void *newldt;
37979 +       unsigned oldsize;
37980 +
37981 +       if (mincount <= (unsigned)pc->size)
37982 +               return 0;
37983 +       oldsize = pc->size;
37984 +       mincount = (mincount+511)&(~511);
37985 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
37986 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
37987 +       else
37988 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
37989 +
37990 +       if (!newldt)
37991 +               return -ENOMEM;
37992 +
37993 +       if (oldsize)
37994 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
37995 +       oldldt = pc->ldt;
37996 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
37997 +       wmb();
37998 +       pc->ldt = newldt;
37999 +       wmb();
38000 +       pc->size = mincount;
38001 +       wmb();
38002 +       if (reload) {
38003 +#ifdef CONFIG_SMP
38004 +               cpumask_t mask;
38005 +
38006 +               preempt_disable();
38007 +               mask = cpumask_of_cpu(smp_processor_id());
38008 +#endif
38009 +               make_pages_readonly(
38010 +                       pc->ldt,
38011 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
38012 +                       XENFEAT_writable_descriptor_tables);
38013 +               load_LDT(pc);
38014 +#ifdef CONFIG_SMP
38015 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
38016 +                       smp_call_function(flush_ldt, NULL, 1, 1);
38017 +               preempt_enable();
38018 +#endif
38019 +       }
38020 +       if (oldsize) {
38021 +               make_pages_writable(
38022 +                       oldldt,
38023 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
38024 +                       XENFEAT_writable_descriptor_tables);
38025 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
38026 +                       vfree(oldldt);
38027 +               else
38028 +                       kfree(oldldt);
38029 +       }
38030 +       return 0;
38031 +}
38032 +
38033 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
38034 +{
38035 +       int err = alloc_ldt(new, old->size, 0);
38036 +       if (err < 0)
38037 +               return err;
38038 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
38039 +       make_pages_readonly(
38040 +               new->ldt,
38041 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
38042 +               XENFEAT_writable_descriptor_tables);
38043 +       return 0;
38044 +}
38045 +
38046 +/*
38047 + * we do not have to muck with descriptors here, that is
38048 + * done in switch_mm() as needed.
38049 + */
38050 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
38051 +{
38052 +       struct mm_struct * old_mm;
38053 +       int retval = 0;
38054 +
38055 +       memset(&mm->context, 0, sizeof(mm->context));
38056 +       init_MUTEX(&mm->context.sem);
38057 +       mm->context.size = 0;
38058 +       old_mm = current->mm;
38059 +       if (old_mm && old_mm->context.size > 0) {
38060 +               down(&old_mm->context.sem);
38061 +               retval = copy_ldt(&mm->context, &old_mm->context);
38062 +               up(&old_mm->context.sem);
38063 +       }
38064 +       if (retval == 0) {
38065 +               spin_lock(&mm_unpinned_lock);
38066 +               list_add(&mm->context.unpinned, &mm_unpinned);
38067 +               spin_unlock(&mm_unpinned_lock);
38068 +       }
38069 +       return retval;
38070 +}
38071 +
38072 +/*
38073 + * 
38074 + * Don't touch the LDT register - we're already in the next thread.
38075 + */
38076 +void destroy_context(struct mm_struct *mm)
38077 +{
38078 +       if (mm->context.size) {
38079 +               if (mm == current->active_mm)
38080 +                       clear_LDT();
38081 +               make_pages_writable(
38082 +                       mm->context.ldt,
38083 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
38084 +                       XENFEAT_writable_descriptor_tables);
38085 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
38086 +                       vfree(mm->context.ldt);
38087 +               else
38088 +                       kfree(mm->context.ldt);
38089 +               mm->context.size = 0;
38090 +       }
38091 +       if (!mm->context.pinned) {
38092 +               spin_lock(&mm_unpinned_lock);
38093 +               list_del(&mm->context.unpinned);
38094 +               spin_unlock(&mm_unpinned_lock);
38095 +       }
38096 +}
38097 +
38098 +static int read_ldt(void __user * ptr, unsigned long bytecount)
38099 +{
38100 +       int err;
38101 +       unsigned long size;
38102 +       struct mm_struct * mm = current->mm;
38103 +
38104 +       if (!mm->context.size)
38105 +               return 0;
38106 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
38107 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
38108 +
38109 +       down(&mm->context.sem);
38110 +       size = mm->context.size*LDT_ENTRY_SIZE;
38111 +       if (size > bytecount)
38112 +               size = bytecount;
38113 +
38114 +       err = 0;
38115 +       if (copy_to_user(ptr, mm->context.ldt, size))
38116 +               err = -EFAULT;
38117 +       up(&mm->context.sem);
38118 +       if (err < 0)
38119 +               goto error_return;
38120 +       if (size != bytecount) {
38121 +               /* zero-fill the rest */
38122 +               if (clear_user(ptr+size, bytecount-size) != 0) {
38123 +                       err = -EFAULT;
38124 +                       goto error_return;
38125 +               }
38126 +       }
38127 +       return bytecount;
38128 +error_return:
38129 +       return err;
38130 +}
38131 +
38132 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
38133 +{
38134 +       /* Arbitrary number */ 
38135 +       /* x86-64 default LDT is all zeros */
38136 +       if (bytecount > 128) 
38137 +               bytecount = 128;        
38138 +       if (clear_user(ptr, bytecount))
38139 +               return -EFAULT;
38140 +       return bytecount; 
38141 +}
38142 +
38143 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
38144 +{
38145 +       struct task_struct *me = current;
38146 +       struct mm_struct * mm = me->mm;
38147 +       __u32 entry_1, entry_2, *lp;
38148 +       unsigned long mach_lp;
38149 +       int error;
38150 +       struct user_desc ldt_info;
38151 +
38152 +       error = -EINVAL;
38153 +
38154 +       if (bytecount != sizeof(ldt_info))
38155 +               goto out;
38156 +       error = -EFAULT;        
38157 +       if (copy_from_user(&ldt_info, ptr, bytecount))
38158 +               goto out;
38159 +
38160 +       error = -EINVAL;
38161 +       if (ldt_info.entry_number >= LDT_ENTRIES)
38162 +               goto out;
38163 +       if (ldt_info.contents == 3) {
38164 +               if (oldmode)
38165 +                       goto out;
38166 +               if (ldt_info.seg_not_present == 0)
38167 +                       goto out;
38168 +       }
38169 +
38170 +       down(&mm->context.sem);
38171 +       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
38172 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
38173 +               if (error < 0)
38174 +                       goto out_unlock;
38175 +       }
38176 +
38177 +       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
38178 +       mach_lp = arbitrary_virt_to_machine(lp);
38179 +
38180 +       /* Allow LDTs to be cleared by the user. */
38181 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
38182 +               if (oldmode || LDT_empty(&ldt_info)) {
38183 +                       entry_1 = 0;
38184 +                       entry_2 = 0;
38185 +                       goto install;
38186 +               }
38187 +       }
38188 +
38189 +       entry_1 = LDT_entry_a(&ldt_info);
38190 +       entry_2 = LDT_entry_b(&ldt_info);
38191 +       if (oldmode)
38192 +               entry_2 &= ~(1 << 20);
38193 +
38194 +       /* Install the new entry ...  */
38195 +install:
38196 +       error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
38197 +
38198 +out_unlock:
38199 +       up(&mm->context.sem);
38200 +out:
38201 +       return error;
38202 +}
38203 +
38204 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
38205 +{
38206 +       int ret = -ENOSYS;
38207 +
38208 +       switch (func) {
38209 +       case 0:
38210 +               ret = read_ldt(ptr, bytecount);
38211 +               break;
38212 +       case 1:
38213 +               ret = write_ldt(ptr, bytecount, 1);
38214 +               break;
38215 +       case 2:
38216 +               ret = read_default_ldt(ptr, bytecount);
38217 +               break;
38218 +       case 0x11:
38219 +               ret = write_ldt(ptr, bytecount, 0);
38220 +               break;
38221 +       }
38222 +       return ret;
38223 +}
38224 diff -ruNp linux-2.6.19/arch/x86_64/kernel/machine_kexec.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/machine_kexec.c
38225 --- linux-2.6.19/arch/x86_64/kernel/machine_kexec.c     2006-11-29 21:57:37.000000000 +0000
38226 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/machine_kexec.c   2007-02-02 19:10:26.000000000 +0000
38227 @@ -24,6 +24,104 @@ static u64 kexec_pud1[512] PAGE_ALIGNED;
38228  static u64 kexec_pmd1[512] PAGE_ALIGNED;
38229  static u64 kexec_pte1[512] PAGE_ALIGNED;
38230  
38231 +#ifdef CONFIG_XEN
38232 +
38233 +/* In the case of Xen, override hypervisor functions to be able to create
38234 + * a regular identity mapping page table...
38235 + */
38236 +
38237 +#include <xen/interface/kexec.h>
38238 +#include <xen/interface/memory.h>
38239 +
38240 +#define x__pmd(x) ((pmd_t) { (x) } )
38241 +#define x__pud(x) ((pud_t) { (x) } )
38242 +#define x__pgd(x) ((pgd_t) { (x) } )
38243 +
38244 +#define x_pmd_val(x)   ((x).pmd)
38245 +#define x_pud_val(x)   ((x).pud)
38246 +#define x_pgd_val(x)   ((x).pgd)
38247 +
38248 +static inline void x_set_pmd(pmd_t *dst, pmd_t val)
38249 +{
38250 +       x_pmd_val(*dst) = x_pmd_val(val);
38251 +}
38252 +
38253 +static inline void x_set_pud(pud_t *dst, pud_t val)
38254 +{
38255 +       x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
38256 +}
38257 +
38258 +static inline void x_pud_clear (pud_t *pud)
38259 +{
38260 +       x_pud_val(*pud) = 0;
38261 +}
38262 +
38263 +static inline void x_set_pgd(pgd_t *dst, pgd_t val)
38264 +{
38265 +       x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
38266 +}
38267 +
38268 +static inline void x_pgd_clear (pgd_t * pgd)
38269 +{
38270 +       x_pgd_val(*pgd) = 0;
38271 +}
38272 +
38273 +#define X__PAGE_KERNEL_LARGE_EXEC \
38274 +         _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
38275 +#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
38276 +
38277 +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
38278 +
38279 +#if PAGES_NR > KEXEC_XEN_NO_PAGES
38280 +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
38281 +#endif
38282 +
38283 +#if PA_CONTROL_PAGE != 0
38284 +#error PA_CONTROL_PAGE is non zero - Xen support will break
38285 +#endif
38286 +
38287 +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
38288 +{
38289 +       void *control_page;
38290 +       void *table_page;
38291 +
38292 +       memset(xki->page_list, 0, sizeof(xki->page_list));
38293 +
38294 +       control_page = page_address(image->control_code_page) + PAGE_SIZE;
38295 +       memcpy(control_page, relocate_kernel, PAGE_SIZE);
38296 +
38297 +       table_page = page_address(image->control_code_page);
38298 +
38299 +       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
38300 +       xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
38301 +
38302 +       xki->page_list[PA_PGD] = __ma(kexec_pgd);
38303 +       xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
38304 +       xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
38305 +       xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
38306 +       xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
38307 +       xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
38308 +       xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
38309 +}
38310 +
38311 +#else /* CONFIG_XEN */
38312 +
38313 +#define x__pmd(x) __pmd(x)
38314 +#define x__pud(x) __pud(x)
38315 +#define x__pgd(x) __pgd(x)
38316 +
38317 +#define x_set_pmd(x, y) set_pmd(x, y)
38318 +#define x_set_pud(x, y) set_pud(x, y)
38319 +#define x_set_pgd(x, y) set_pgd(x, y)
38320 +
38321 +#define x_pud_clear(x) pud_clear(x)
38322 +#define x_pgd_clear(x) pgd_clear(x)
38323 +
38324 +#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
38325 +#define X_KERNPG_TABLE _KERNPG_TABLE
38326 +
38327 +#endif /* CONFIG_XEN */
38328 +
38329  static void init_level2_page(pmd_t *level2p, unsigned long addr)
38330  {
38331         unsigned long end_addr;
38332 @@ -31,7 +129,7 @@ static void init_level2_page(pmd_t *leve
38333         addr &= PAGE_MASK;
38334         end_addr = addr + PUD_SIZE;
38335         while (addr < end_addr) {
38336 -               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
38337 +               x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
38338                 addr += PMD_SIZE;
38339         }
38340  }
38341 @@ -56,12 +154,12 @@ static int init_level3_page(struct kimag
38342                 }
38343                 level2p = (pmd_t *)page_address(page);
38344                 init_level2_page(level2p, addr);
38345 -               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
38346 +               x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
38347                 addr += PUD_SIZE;
38348         }
38349         /* clear the unused entries */
38350         while (addr < end_addr) {
38351 -               pud_clear(level3p++);
38352 +               x_pud_clear(level3p++);
38353                 addr += PUD_SIZE;
38354         }
38355  out:
38356 @@ -92,12 +190,12 @@ static int init_level4_page(struct kimag
38357                 if (result) {
38358                         goto out;
38359                 }
38360 -               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
38361 +               x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
38362                 addr += PGDIR_SIZE;
38363         }
38364         /* clear the unused entries */
38365         while (addr < end_addr) {
38366 -               pgd_clear(level4p++);
38367 +               x_pgd_clear(level4p++);
38368                 addr += PGDIR_SIZE;
38369         }
38370  out:
38371 @@ -108,8 +206,14 @@ out:
38372  static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
38373  {
38374         pgd_t *level4p;
38375 +       unsigned long x_end_pfn = end_pfn;
38376 +
38377 +#ifdef CONFIG_XEN
38378 +       x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
38379 +#endif
38380 +
38381         level4p = (pgd_t *)__va(start_pgtable);
38382 -       return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
38383 +       return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
38384  }
38385  
38386  static void set_idt(void *newidt, u16 limit)
38387 @@ -174,6 +278,7 @@ void machine_kexec_cleanup(struct kimage
38388         return;
38389  }
38390  
38391 +#ifndef CONFIG_XEN
38392  /*
38393   * Do not allocate memory (or fail in any way) in machine_kexec().
38394   * We are past the point of no return, committed to rebooting now.
38395 @@ -229,6 +334,7 @@ NORET_TYPE void machine_kexec(struct kim
38396         relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
38397                         image->start);
38398  }
38399 +#endif
38400  
38401  /* crashkernel=size@addr specifies the location to reserve for
38402   * a crash kernel.  By reserving this memory we guarantee
38403 @@ -256,4 +362,3 @@ static int __init setup_crashkernel(char
38404         return 0;
38405  }
38406  early_param("crashkernel", setup_crashkernel);
38407 -
38408 diff -ruNp linux-2.6.19/arch/x86_64/kernel/mpparse-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/mpparse-xen.c
38409 --- linux-2.6.19/arch/x86_64/kernel/mpparse-xen.c       1970-01-01 00:00:00.000000000 +0000
38410 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/mpparse-xen.c     2007-02-02 19:10:26.000000000 +0000
38411 @@ -0,0 +1,850 @@
38412 +/*
38413 + *     Intel Multiprocessor Specification 1.1 and 1.4
38414 + *     compliant MP-table parsing routines.
38415 + *
38416 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
38417 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
38418 + *
38419 + *     Fixes
38420 + *             Erich Boleyn    :       MP v1.4 and additional changes.
38421 + *             Alan Cox        :       Added EBDA scanning
38422 + *             Ingo Molnar     :       various cleanups and rewrites
38423 + *             Maciej W. Rozycki:      Bits for default MP configurations
38424 + *             Paul Diefenbaugh:       Added full ACPI support
38425 + */
38426 +
38427 +#include <linux/mm.h>
38428 +#include <linux/init.h>
38429 +#include <linux/delay.h>
38430 +#include <linux/bootmem.h>
38431 +#include <linux/smp_lock.h>
38432 +#include <linux/kernel_stat.h>
38433 +#include <linux/mc146818rtc.h>
38434 +#include <linux/acpi.h>
38435 +#include <linux/module.h>
38436 +
38437 +#include <asm/smp.h>
38438 +#include <asm/mtrr.h>
38439 +#include <asm/mpspec.h>
38440 +#include <asm/pgalloc.h>
38441 +#include <asm/io_apic.h>
38442 +#include <asm/proto.h>
38443 +#include <asm/acpi.h>
38444 +
38445 +/* Have we found an MP table */
38446 +int smp_found_config;
38447 +unsigned int __initdata maxcpus = NR_CPUS;
38448 +
38449 +int acpi_found_madt;
38450 +
38451 +/*
38452 + * Various Linux-internal data structures created from the
38453 + * MP-table.
38454 + */
38455 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
38456 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
38457 +
38458 +static int mp_current_pci_id = 0;
38459 +/* I/O APIC entries */
38460 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
38461 +
38462 +/* # of MP IRQ source entries */
38463 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
38464 +
38465 +/* MP IRQ source entries */
38466 +int mp_irq_entries;
38467 +
38468 +int nr_ioapics;
38469 +unsigned long mp_lapic_addr = 0;
38470 +
38471 +
38472 +
38473 +/* Processor that is doing the boot up */
38474 +unsigned int boot_cpu_id = -1U;
38475 +/* Internal processor count */
38476 +unsigned int num_processors __initdata = 0;
38477 +
38478 +unsigned disabled_cpus __initdata;
38479 +
38480 +/* Bitmask of physically existing CPUs */
38481 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
38482 +
38483 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
38484 +
38485 +
38486 +/*
38487 + * Intel MP BIOS table parsing routines:
38488 + */
38489 +
38490 +/*
38491 + * Checksum an MP configuration block.
38492 + */
38493 +
38494 +static int __init mpf_checksum(unsigned char *mp, int len)
38495 +{
38496 +       int sum = 0;
38497 +
38498 +       while (len--)
38499 +               sum += *mp++;
38500 +
38501 +       return sum & 0xFF;
38502 +}
38503 +
38504 +#ifndef CONFIG_XEN
38505 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
38506 +{
38507 +       int cpu;
38508 +       cpumask_t tmp_map;
38509 +       char *bootup_cpu = "";
38510 +
38511 +       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
38512 +               disabled_cpus++;
38513 +               return;
38514 +       }
38515 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
38516 +               bootup_cpu = " (Bootup-CPU)";
38517 +               boot_cpu_id = m->mpc_apicid;
38518 +       }
38519 +
38520 +       printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
38521 +
38522 +       if (num_processors >= NR_CPUS) {
38523 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
38524 +                       " Processor ignored.\n", NR_CPUS);
38525 +               return;
38526 +       }
38527 +
38528 +       num_processors++;
38529 +       cpus_complement(tmp_map, cpu_present_map);
38530 +       cpu = first_cpu(tmp_map);
38531 +
38532 +       physid_set(m->mpc_apicid, phys_cpu_present_map);
38533 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
38534 +               /*
38535 +                * bios_cpu_apicid is required to have processors listed
38536 +                * in same order as logical cpu numbers. Hence the first
38537 +                * entry is BSP, and so on.
38538 +                */
38539 +               cpu = 0;
38540 +       }
38541 +       bios_cpu_apicid[cpu] = m->mpc_apicid;
38542 +       x86_cpu_to_apicid[cpu] = m->mpc_apicid;
38543 +
38544 +       cpu_set(cpu, cpu_possible_map);
38545 +       cpu_set(cpu, cpu_present_map);
38546 +}
38547 +#else
38548 +void __init MP_processor_info (struct mpc_config_processor *m)
38549 +{
38550 +       num_processors++;
38551 +}
38552 +#endif /* CONFIG_XEN */
38553 +
38554 +static void __init MP_bus_info (struct mpc_config_bus *m)
38555 +{
38556 +       char str[7];
38557 +
38558 +       memcpy(str, m->mpc_bustype, 6);
38559 +       str[6] = 0;
38560 +       Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
38561 +
38562 +       if (strncmp(str, "ISA", 3) == 0) {
38563 +               set_bit(m->mpc_busid, mp_bus_not_pci);
38564 +       } else if (strncmp(str, "PCI", 3) == 0) {
38565 +               clear_bit(m->mpc_busid, mp_bus_not_pci);
38566 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
38567 +               mp_current_pci_id++;
38568 +       } else {
38569 +               printk(KERN_ERR "Unknown bustype %s\n", str);
38570 +       }
38571 +}
38572 +
38573 +static int bad_ioapic(unsigned long address)
38574 +{
38575 +       if (nr_ioapics >= MAX_IO_APICS) {
38576 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
38577 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
38578 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
38579 +       }
38580 +       if (!address) {
38581 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
38582 +                       " found in table, skipping!\n");
38583 +               return 1;
38584 +       }
38585 +       return 0;
38586 +}
38587 +
38588 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
38589 +{
38590 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
38591 +               return;
38592 +
38593 +       printk("I/O APIC #%d at 0x%X.\n",
38594 +               m->mpc_apicid, m->mpc_apicaddr);
38595 +
38596 +       if (bad_ioapic(m->mpc_apicaddr))
38597 +               return;
38598 +
38599 +       mp_ioapics[nr_ioapics] = *m;
38600 +       nr_ioapics++;
38601 +}
38602 +
38603 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
38604 +{
38605 +       mp_irqs [mp_irq_entries] = *m;
38606 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
38607 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
38608 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
38609 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
38610 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
38611 +       if (++mp_irq_entries >= MAX_IRQ_SOURCES)
38612 +               panic("Max # of irq sources exceeded!!\n");
38613 +}
38614 +
38615 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
38616 +{
38617 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
38618 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
38619 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
38620 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
38621 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
38622 +}
38623 +
38624 +/*
38625 + * Read/parse the MPC
38626 + */
38627 +
38628 +static int __init smp_read_mpc(struct mp_config_table *mpc)
38629 +{
38630 +       char str[16];
38631 +       int count=sizeof(*mpc);
38632 +       unsigned char *mpt=((unsigned char *)mpc)+count;
38633 +
38634 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
38635 +               printk("MPTABLE: bad signature [%c%c%c%c]!\n",
38636 +                       mpc->mpc_signature[0],
38637 +                       mpc->mpc_signature[1],
38638 +                       mpc->mpc_signature[2],
38639 +                       mpc->mpc_signature[3]);
38640 +               return 0;
38641 +       }
38642 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
38643 +               printk("MPTABLE: checksum error!\n");
38644 +               return 0;
38645 +       }
38646 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
38647 +               printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
38648 +                       mpc->mpc_spec);
38649 +               return 0;
38650 +       }
38651 +       if (!mpc->mpc_lapic) {
38652 +               printk(KERN_ERR "MPTABLE: null local APIC address!\n");
38653 +               return 0;
38654 +       }
38655 +       memcpy(str,mpc->mpc_oem,8);
38656 +       str[8] = 0;
38657 +       printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
38658 +
38659 +       memcpy(str,mpc->mpc_productid,12);
38660 +       str[12] = 0;
38661 +       printk("MPTABLE: Product ID: %s ",str);
38662 +
38663 +       printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
38664 +
38665 +       /* save the local APIC address, it might be non-default */
38666 +       if (!acpi_lapic)
38667 +               mp_lapic_addr = mpc->mpc_lapic;
38668 +
38669 +       /*
38670 +        *      Now process the configuration blocks.
38671 +        */
38672 +       while (count < mpc->mpc_length) {
38673 +               switch(*mpt) {
38674 +                       case MP_PROCESSOR:
38675 +                       {
38676 +                               struct mpc_config_processor *m=
38677 +                                       (struct mpc_config_processor *)mpt;
38678 +                               if (!acpi_lapic)
38679 +                                       MP_processor_info(m);
38680 +                               mpt += sizeof(*m);
38681 +                               count += sizeof(*m);
38682 +                               break;
38683 +                       }
38684 +                       case MP_BUS:
38685 +                       {
38686 +                               struct mpc_config_bus *m=
38687 +                                       (struct mpc_config_bus *)mpt;
38688 +                               MP_bus_info(m);
38689 +                               mpt += sizeof(*m);
38690 +                               count += sizeof(*m);
38691 +                               break;
38692 +                       }
38693 +                       case MP_IOAPIC:
38694 +                       {
38695 +                               struct mpc_config_ioapic *m=
38696 +                                       (struct mpc_config_ioapic *)mpt;
38697 +                               MP_ioapic_info(m);
38698 +                               mpt += sizeof(*m);
38699 +                               count += sizeof(*m);
38700 +                               break;
38701 +                       }
38702 +                       case MP_INTSRC:
38703 +                       {
38704 +                               struct mpc_config_intsrc *m=
38705 +                                       (struct mpc_config_intsrc *)mpt;
38706 +
38707 +                               MP_intsrc_info(m);
38708 +                               mpt += sizeof(*m);
38709 +                               count += sizeof(*m);
38710 +                               break;
38711 +                       }
38712 +                       case MP_LINTSRC:
38713 +                       {
38714 +                               struct mpc_config_lintsrc *m=
38715 +                                       (struct mpc_config_lintsrc *)mpt;
38716 +                               MP_lintsrc_info(m);
38717 +                               mpt += sizeof(*m);
38718 +                               count += sizeof(*m);
38719 +                               break;
38720 +                       }
38721 +               }
38722 +       }
38723 +       clustered_apic_check();
38724 +       if (!num_processors)
38725 +               printk(KERN_ERR "MPTABLE: no processors registered!\n");
38726 +       return num_processors;
38727 +}
38728 +
38729 +static int __init ELCR_trigger(unsigned int irq)
38730 +{
38731 +       unsigned int port;
38732 +
38733 +       port = 0x4d0 + (irq >> 3);
38734 +       return (inb(port) >> (irq & 7)) & 1;
38735 +}
38736 +
38737 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
38738 +{
38739 +       struct mpc_config_intsrc intsrc;
38740 +       int i;
38741 +       int ELCR_fallback = 0;
38742 +
38743 +       intsrc.mpc_type = MP_INTSRC;
38744 +       intsrc.mpc_irqflag = 0;                 /* conforming */
38745 +       intsrc.mpc_srcbus = 0;
38746 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
38747 +
38748 +       intsrc.mpc_irqtype = mp_INT;
38749 +
38750 +       /*
38751 +        *  If true, we have an ISA/PCI system with no IRQ entries
38752 +        *  in the MP table. To prevent the PCI interrupts from being set up
38753 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
38754 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
38755 +        *  never be level sensitive, so we simply see if the ELCR agrees.
38756 +        *  If it does, we assume it's valid.
38757 +        */
38758 +       if (mpc_default_type == 5) {
38759 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
38760 +
38761 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
38762 +                       printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
38763 +               else {
38764 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
38765 +                       ELCR_fallback = 1;
38766 +               }
38767 +       }
38768 +
38769 +       for (i = 0; i < 16; i++) {
38770 +               switch (mpc_default_type) {
38771 +               case 2:
38772 +                       if (i == 0 || i == 13)
38773 +                               continue;       /* IRQ0 & IRQ13 not connected */
38774 +                       /* fall through */
38775 +               default:
38776 +                       if (i == 2)
38777 +                               continue;       /* IRQ2 is never connected */
38778 +               }
38779 +
38780 +               if (ELCR_fallback) {
38781 +                       /*
38782 +                        *  If the ELCR indicates a level-sensitive interrupt, we
38783 +                        *  copy that information over to the MP table in the
38784 +                        *  irqflag field (level sensitive, active high polarity).
38785 +                        */
38786 +                       if (ELCR_trigger(i))
38787 +                               intsrc.mpc_irqflag = 13;
38788 +                       else
38789 +                               intsrc.mpc_irqflag = 0;
38790 +               }
38791 +
38792 +               intsrc.mpc_srcbusirq = i;
38793 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
38794 +               MP_intsrc_info(&intsrc);
38795 +       }
38796 +
38797 +       intsrc.mpc_irqtype = mp_ExtINT;
38798 +       intsrc.mpc_srcbusirq = 0;
38799 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
38800 +       MP_intsrc_info(&intsrc);
38801 +}
38802 +
38803 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
38804 +{
38805 +       struct mpc_config_processor processor;
38806 +       struct mpc_config_bus bus;
38807 +       struct mpc_config_ioapic ioapic;
38808 +       struct mpc_config_lintsrc lintsrc;
38809 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
38810 +       int i;
38811 +
38812 +       /*
38813 +        * local APIC has default address
38814 +        */
38815 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
38816 +
38817 +       /*
38818 +        * 2 CPUs, numbered 0 & 1.
38819 +        */
38820 +       processor.mpc_type = MP_PROCESSOR;
38821 +       processor.mpc_apicver = 0;
38822 +       processor.mpc_cpuflag = CPU_ENABLED;
38823 +       processor.mpc_cpufeature = 0;
38824 +       processor.mpc_featureflag = 0;
38825 +       processor.mpc_reserved[0] = 0;
38826 +       processor.mpc_reserved[1] = 0;
38827 +       for (i = 0; i < 2; i++) {
38828 +               processor.mpc_apicid = i;
38829 +               MP_processor_info(&processor);
38830 +       }
38831 +
38832 +       bus.mpc_type = MP_BUS;
38833 +       bus.mpc_busid = 0;
38834 +       switch (mpc_default_type) {
38835 +               default:
38836 +                       printk(KERN_ERR "???\nUnknown standard configuration %d\n",
38837 +                               mpc_default_type);
38838 +                       /* fall through */
38839 +               case 1:
38840 +               case 5:
38841 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
38842 +                       break;
38843 +       }
38844 +       MP_bus_info(&bus);
38845 +       if (mpc_default_type > 4) {
38846 +               bus.mpc_busid = 1;
38847 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
38848 +               MP_bus_info(&bus);
38849 +       }
38850 +
38851 +       ioapic.mpc_type = MP_IOAPIC;
38852 +       ioapic.mpc_apicid = 2;
38853 +       ioapic.mpc_apicver = 0;
38854 +       ioapic.mpc_flags = MPC_APIC_USABLE;
38855 +       ioapic.mpc_apicaddr = 0xFEC00000;
38856 +       MP_ioapic_info(&ioapic);
38857 +
38858 +       /*
38859 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
38860 +        */
38861 +       construct_default_ioirq_mptable(mpc_default_type);
38862 +
38863 +       lintsrc.mpc_type = MP_LINTSRC;
38864 +       lintsrc.mpc_irqflag = 0;                /* conforming */
38865 +       lintsrc.mpc_srcbusid = 0;
38866 +       lintsrc.mpc_srcbusirq = 0;
38867 +       lintsrc.mpc_destapic = MP_APIC_ALL;
38868 +       for (i = 0; i < 2; i++) {
38869 +               lintsrc.mpc_irqtype = linttypes[i];
38870 +               lintsrc.mpc_destapiclint = i;
38871 +               MP_lintsrc_info(&lintsrc);
38872 +       }
38873 +}
38874 +
38875 +static struct intel_mp_floating *mpf_found;
38876 +
38877 +/*
38878 + * Scan the memory blocks for an SMP configuration block.
38879 + */
38880 +void __init get_smp_config (void)
38881 +{
38882 +       struct intel_mp_floating *mpf = mpf_found;
38883 +
38884 +       /*
38885 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
38886 +        * processors, where MPS only supports physical.
38887 +        */
38888 +       if (acpi_lapic && acpi_ioapic) {
38889 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
38890 +               return;
38891 +       }
38892 +       else if (acpi_lapic)
38893 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
38894 +
38895 +       printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
38896 +
38897 +       /*
38898 +        * Now see if we need to read further.
38899 +        */
38900 +       if (mpf->mpf_feature1 != 0) {
38901 +
38902 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
38903 +               construct_default_ISA_mptable(mpf->mpf_feature1);
38904 +
38905 +       } else if (mpf->mpf_physptr) {
38906 +
38907 +               /*
38908 +                * Read the physical hardware table.  Anything here will
38909 +                * override the defaults.
38910 +                */
38911 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
38912 +                       smp_found_config = 0;
38913 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
38914 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
38915 +                       return;
38916 +               }
38917 +               /*
38918 +                * If there are no explicit MP IRQ entries, then we are
38919 +                * broken.  We set up most of the low 16 IO-APIC pins to
38920 +                * ISA defaults and hope it will work.
38921 +                */
38922 +               if (!mp_irq_entries) {
38923 +                       struct mpc_config_bus bus;
38924 +
38925 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
38926 +
38927 +                       bus.mpc_type = MP_BUS;
38928 +                       bus.mpc_busid = 0;
38929 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
38930 +                       MP_bus_info(&bus);
38931 +
38932 +                       construct_default_ioirq_mptable(0);
38933 +               }
38934 +
38935 +       } else
38936 +               BUG();
38937 +
38938 +       printk(KERN_INFO "Processors: %d\n", num_processors);
38939 +       /*
38940 +        * Only use the first configuration found.
38941 +        */
38942 +}
38943 +
38944 +static int __init smp_scan_config (unsigned long base, unsigned long length)
38945 +{
38946 +       extern void __bad_mpf_size(void); 
38947 +       unsigned int *bp = isa_bus_to_virt(base);
38948 +       struct intel_mp_floating *mpf;
38949 +
38950 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
38951 +       if (sizeof(*mpf) != 16)
38952 +               __bad_mpf_size();
38953 +
38954 +       while (length > 0) {
38955 +               mpf = (struct intel_mp_floating *)bp;
38956 +               if ((*bp == SMP_MAGIC_IDENT) &&
38957 +                       (mpf->mpf_length == 1) &&
38958 +                       !mpf_checksum((unsigned char *)bp, 16) &&
38959 +                       ((mpf->mpf_specification == 1)
38960 +                               || (mpf->mpf_specification == 4)) ) {
38961 +
38962 +                       smp_found_config = 1;
38963 +                       mpf_found = mpf;
38964 +                       return 1;
38965 +               }
38966 +               bp += 4;
38967 +               length -= 16;
38968 +       }
38969 +       return 0;
38970 +}
38971 +
38972 +void __init find_smp_config(void)
38973 +{
38974 +       unsigned int address;
38975 +
38976 +       /*
38977 +        * FIXME: Linux assumes you have 640K of base ram..
38978 +        * this continues the error...
38979 +        *
38980 +        * 1) Scan the bottom 1K for a signature
38981 +        * 2) Scan the top 1K of base RAM
38982 +        * 3) Scan the 64K of bios
38983 +        */
38984 +       if (smp_scan_config(0x0,0x400) ||
38985 +               smp_scan_config(639*0x400,0x400) ||
38986 +                       smp_scan_config(0xF0000,0x10000))
38987 +               return;
38988 +       /*
38989 +        * If it is an SMP machine we should know now.
38990 +        *
38991 +        * there is a real-mode segmented pointer pointing to the
38992 +        * 4K EBDA area at 0x40E, calculate and scan it here.
38993 +        *
38994 +        * NOTE! There are Linux loaders that will corrupt the EBDA
38995 +        * area, and as such this kind of SMP config may be less
38996 +        * trustworthy, simply because the SMP table may have been
38997 +        * stomped on during early boot. These loaders are buggy and
38998 +        * should be fixed.
38999 +        */
39000 +
39001 +       address = *(unsigned short *)phys_to_virt(0x40E);
39002 +       address <<= 4;
39003 +       if (smp_scan_config(address, 0x1000))
39004 +               return;
39005 +
39006 +       /* If we have come this far, we did not find an MP table  */
39007 +        printk(KERN_INFO "No mptable found.\n");
39008 +}
39009 +
39010 +/* --------------------------------------------------------------------------
39011 +                            ACPI-based MP Configuration
39012 +   -------------------------------------------------------------------------- */
39013 +
39014 +#ifdef CONFIG_ACPI
39015 +
39016 +void __init mp_register_lapic_address(u64 address)
39017 +{
39018 +#ifndef CONFIG_XEN
39019 +       mp_lapic_addr = (unsigned long) address;
39020 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
39021 +       if (boot_cpu_id == -1U)
39022 +               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
39023 +#endif
39024 +}
39025 +
39026 +void __cpuinit mp_register_lapic (u8 id, u8 enabled)
39027 +{
39028 +       struct mpc_config_processor processor;
39029 +       int                     boot_cpu = 0;
39030 +       
39031 +       if (id == boot_cpu_id)
39032 +               boot_cpu = 1;
39033 +
39034 +#ifndef CONFIG_XEN
39035 +       processor.mpc_type = MP_PROCESSOR;
39036 +       processor.mpc_apicid = id;
39037 +       processor.mpc_apicver = 0;
39038 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
39039 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
39040 +       processor.mpc_cpufeature = 0;
39041 +       processor.mpc_featureflag = 0;
39042 +       processor.mpc_reserved[0] = 0;
39043 +       processor.mpc_reserved[1] = 0;
39044 +#endif
39045 +
39046 +       MP_processor_info(&processor);
39047 +}
39048 +
39049 +#define MP_ISA_BUS             0
39050 +#define MP_MAX_IOAPIC_PIN      127
39051 +
39052 +static struct mp_ioapic_routing {
39053 +       int                     apic_id;
39054 +       int                     gsi_start;
39055 +       int                     gsi_end;
39056 +       u32                     pin_programmed[4];
39057 +} mp_ioapic_routing[MAX_IO_APICS];
39058 +
39059 +static int mp_find_ioapic(int gsi)
39060 +{
39061 +       int i = 0;
39062 +
39063 +       /* Find the IOAPIC that manages this GSI. */
39064 +       for (i = 0; i < nr_ioapics; i++) {
39065 +               if ((gsi >= mp_ioapic_routing[i].gsi_start)
39066 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
39067 +                       return i;
39068 +       }
39069 +
39070 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
39071 +       return -1;
39072 +}
39073 +
39074 +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
39075 +{
39076 +       int idx = 0;
39077 +
39078 +       if (bad_ioapic(address))
39079 +               return;
39080 +
39081 +       idx = nr_ioapics++;
39082 +
39083 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
39084 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
39085 +       mp_ioapics[idx].mpc_apicaddr = address;
39086 +
39087 +#ifndef CONFIG_XEN
39088 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
39089 +#endif
39090 +       mp_ioapics[idx].mpc_apicid = id;
39091 +       mp_ioapics[idx].mpc_apicver = 0;
39092 +       
39093 +       /* 
39094 +        * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
39095 +        * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
39096 +        */
39097 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
39098 +       mp_ioapic_routing[idx].gsi_start = gsi_base;
39099 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
39100 +               io_apic_get_redir_entries(idx);
39101 +
39102 +       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
39103 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
39104 +               mp_ioapics[idx].mpc_apicaddr,
39105 +               mp_ioapic_routing[idx].gsi_start,
39106 +               mp_ioapic_routing[idx].gsi_end);
39107 +}
39108 +
39109 +void __init
39110 +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32        gsi)
39111 +{
39112 +       struct mpc_config_intsrc intsrc;
39113 +       int                     ioapic = -1;
39114 +       int                     pin = -1;
39115 +
39116 +       /* 
39117 +        * Convert 'gsi' to 'ioapic.pin'.
39118 +        */
39119 +       ioapic = mp_find_ioapic(gsi);
39120 +       if (ioapic < 0)
39121 +               return;
39122 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
39123 +
39124 +       /*
39125 +        * TBD: This check is for faulty timer entries, where the override
39126 +        *      erroneously sets the trigger to level, resulting in a HUGE 
39127 +        *      increase of timer interrupts!
39128 +        */
39129 +       if ((bus_irq == 0) && (trigger == 3))
39130 +               trigger = 1;
39131 +
39132 +       intsrc.mpc_type = MP_INTSRC;
39133 +       intsrc.mpc_irqtype = mp_INT;
39134 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
39135 +       intsrc.mpc_srcbus = MP_ISA_BUS;
39136 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
39137 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
39138 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
39139 +
39140 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", 
39141 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
39142 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
39143 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
39144 +
39145 +       mp_irqs[mp_irq_entries] = intsrc;
39146 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
39147 +               panic("Max # of irq sources exceeded!\n");
39148 +}
39149 +
39150 +void __init mp_config_acpi_legacy_irqs(void)
39151 +{
39152 +       struct mpc_config_intsrc intsrc;
39153 +       int i = 0;
39154 +       int ioapic = -1;
39155 +
39156 +       /* 
39157 +        * Fabricate the legacy ISA bus (bus #31).
39158 +        */
39159 +       set_bit(MP_ISA_BUS, mp_bus_not_pci);
39160 +
39161 +       /* 
39162 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
39163 +        */
39164 +       ioapic = mp_find_ioapic(0);
39165 +       if (ioapic < 0)
39166 +               return;
39167 +
39168 +       intsrc.mpc_type = MP_INTSRC;
39169 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
39170 +       intsrc.mpc_srcbus = MP_ISA_BUS;
39171 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
39172 +
39173 +       /* 
39174 +        * Use the default configuration for the IRQs 0-15.  Unless
39175 +        * overridden by (MADT) interrupt source override entries.
39176 +        */
39177 +       for (i = 0; i < 16; i++) {
39178 +               int idx;
39179 +
39180 +               for (idx = 0; idx < mp_irq_entries; idx++) {
39181 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
39182 +
39183 +                       /* Do we already have a mapping for this ISA IRQ? */
39184 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
39185 +                               break;
39186 +
39187 +                       /* Do we already have a mapping for this IOAPIC pin */
39188 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
39189 +                               (irq->mpc_dstirq == i))
39190 +                               break;
39191 +               }
39192 +
39193 +               if (idx != mp_irq_entries) {
39194 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
39195 +                       continue;                       /* IRQ already used */
39196 +               }
39197 +
39198 +               intsrc.mpc_irqtype = mp_INT;
39199 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
39200 +               intsrc.mpc_dstirq = i;
39201 +
39202 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
39203 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
39204 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
39205 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
39206 +                       intsrc.mpc_dstirq);
39207 +
39208 +               mp_irqs[mp_irq_entries] = intsrc;
39209 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
39210 +                       panic("Max # of irq sources exceeded!\n");
39211 +       }
39212 +}
39213 +
39214 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
39215 +{
39216 +       int ioapic = -1;
39217 +       int ioapic_pin = 0;
39218 +       int idx, bit = 0;
39219 +
39220 +       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
39221 +               return gsi;
39222 +
39223 +       /* Don't set up the ACPI SCI because it's already set up */
39224 +       if (acpi_fadt.sci_int == gsi)
39225 +               return gsi;
39226 +
39227 +       ioapic = mp_find_ioapic(gsi);
39228 +       if (ioapic < 0) {
39229 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
39230 +               return gsi;
39231 +       }
39232 +
39233 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
39234 +
39235 +       /* 
39236 +        * Avoid pin reprogramming.  PRTs typically include entries  
39237 +        * with redundant pin->gsi mappings (but unique PCI devices);
39238 +        * we only program the IOAPIC on the first.
39239 +        */
39240 +       bit = ioapic_pin % 32;
39241 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
39242 +       if (idx > 3) {
39243 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
39244 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
39245 +                       ioapic_pin);
39246 +               return gsi;
39247 +       }
39248 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
39249 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
39250 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
39251 +               return gsi;
39252 +       }
39253 +
39254 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
39255 +
39256 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
39257 +               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
39258 +               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
39259 +       return gsi;
39260 +}
39261 +#endif /*CONFIG_ACPI*/
39262 diff -ruNp linux-2.6.19/arch/x86_64/kernel/pci-swiotlb-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/pci-swiotlb-xen.c
39263 --- linux-2.6.19/arch/x86_64/kernel/pci-swiotlb-xen.c   1970-01-01 00:00:00.000000000 +0000
39264 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/pci-swiotlb-xen.c 2007-02-02 19:10:26.000000000 +0000
39265 @@ -0,0 +1,55 @@
39266 +/* Glue code to lib/swiotlb.c */
39267 +
39268 +#include <linux/pci.h>
39269 +#include <linux/cache.h>
39270 +#include <linux/module.h>
39271 +#include <asm/dma-mapping.h>
39272 +#include <asm/proto.h>
39273 +#include <asm/swiotlb.h>
39274 +#include <asm/dma.h>
39275 +
39276 +#if 0
39277 +int swiotlb __read_mostly;
39278 +EXPORT_SYMBOL(swiotlb);
39279 +#endif
39280 +
39281 +struct dma_mapping_ops swiotlb_dma_ops = {
39282 +#if 0
39283 +       .mapping_error = swiotlb_dma_mapping_error,
39284 +       .alloc_coherent = swiotlb_alloc_coherent,
39285 +       .free_coherent = swiotlb_free_coherent,
39286 +       .map_single = swiotlb_map_single,
39287 +       .unmap_single = swiotlb_unmap_single,
39288 +       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
39289 +       .sync_single_for_device = swiotlb_sync_single_for_device,
39290 +       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
39291 +       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
39292 +       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
39293 +       .sync_sg_for_device = swiotlb_sync_sg_for_device,
39294 +       .map_sg = swiotlb_map_sg,
39295 +       .unmap_sg = swiotlb_unmap_sg,
39296 +       .dma_supported = NULL,
39297 +#endif
39298 +};
39299 +
39300 +void pci_swiotlb_init(void)
39301 +{
39302 +#if 0
39303 +       /* don't initialize swiotlb if iommu=off (no_iommu=1) */
39304 +       if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
39305 +              swiotlb = 1;
39306 +       if (swiotlb_force)
39307 +               swiotlb = 1;
39308 +       if (swiotlb) {
39309 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
39310 +               swiotlb_init();
39311 +               dma_ops = &swiotlb_dma_ops;
39312 +       }
39313 +#else
39314 +       swiotlb_init();
39315 +       if (swiotlb) {
39316 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
39317 +               dma_ops = &swiotlb_dma_ops;
39318 +       }
39319 +#endif
39320 +}
39321 diff -ruNp linux-2.6.19/arch/x86_64/kernel/process-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/process-xen.c
39322 --- linux-2.6.19/arch/x86_64/kernel/process-xen.c       1970-01-01 00:00:00.000000000 +0000
39323 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/process-xen.c     2007-02-02 19:10:26.000000000 +0000
39324 @@ -0,0 +1,859 @@
39325 +/*
39326 + *  linux/arch/x86-64/kernel/process.c
39327 + *
39328 + *  Copyright (C) 1995  Linus Torvalds
39329 + *
39330 + *  Pentium III FXSR, SSE support
39331 + *     Gareth Hughes <gareth@valinux.com>, May 2000
39332 + * 
39333 + *  X86-64 port
39334 + *     Andi Kleen.
39335 + *
39336 + *     CPU hotplug support - ashok.raj@intel.com
39337 + * 
39338 + *  Jun Nakajima <jun.nakajima@intel.com> 
39339 + *     Modified for Xen
39340 + */
39341 +
39342 +/*
39343 + * This file handles the architecture-dependent parts of process handling..
39344 + */
39345 +
39346 +#include <stdarg.h>
39347 +
39348 +#include <linux/cpu.h>
39349 +#include <linux/errno.h>
39350 +#include <linux/sched.h>
39351 +#include <linux/kernel.h>
39352 +#include <linux/mm.h>
39353 +#include <linux/elfcore.h>
39354 +#include <linux/smp.h>
39355 +#include <linux/slab.h>
39356 +#include <linux/user.h>
39357 +#include <linux/module.h>
39358 +#include <linux/a.out.h>
39359 +#include <linux/interrupt.h>
39360 +#include <linux/delay.h>
39361 +#include <linux/ptrace.h>
39362 +#include <linux/utsname.h>
39363 +#include <linux/random.h>
39364 +#include <linux/notifier.h>
39365 +#include <linux/kprobes.h>
39366 +
39367 +#include <asm/uaccess.h>
39368 +#include <asm/pgtable.h>
39369 +#include <asm/system.h>
39370 +#include <asm/io.h>
39371 +#include <asm/processor.h>
39372 +#include <asm/i387.h>
39373 +#include <asm/mmu_context.h>
39374 +#include <asm/pda.h>
39375 +#include <asm/prctl.h>
39376 +#include <asm/kdebug.h>
39377 +#include <xen/interface/dom0_ops.h>
39378 +#include <xen/interface/physdev.h>
39379 +#include <xen/interface/vcpu.h>
39380 +#include <asm/desc.h>
39381 +#include <asm/proto.h>
39382 +#include <asm/hardirq.h>
39383 +#include <asm/ia32.h>
39384 +#include <asm/idle.h>
39385 +
39386 +#include <xen/cpu_hotplug.h>
39387 +
39388 +asmlinkage extern void ret_from_fork(void);
39389 +
39390 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
39391 +
39392 +unsigned long boot_option_idle_override = 0;
39393 +EXPORT_SYMBOL(boot_option_idle_override);
39394 +
39395 +/*
39396 + * Powermanagement idle function, if any..
39397 + */
39398 +void (*pm_idle)(void);
39399 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
39400 +
39401 +static ATOMIC_NOTIFIER_HEAD(idle_notifier);
39402 +
39403 +void idle_notifier_register(struct notifier_block *n)
39404 +{
39405 +       atomic_notifier_chain_register(&idle_notifier, n);
39406 +}
39407 +EXPORT_SYMBOL_GPL(idle_notifier_register);
39408 +
39409 +void idle_notifier_unregister(struct notifier_block *n)
39410 +{
39411 +       atomic_notifier_chain_unregister(&idle_notifier, n);
39412 +}
39413 +EXPORT_SYMBOL(idle_notifier_unregister);
39414 +
39415 +void enter_idle(void)
39416 +{
39417 +       write_pda(isidle, 1);
39418 +       atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
39419 +}
39420 +
39421 +static void __exit_idle(void)
39422 +{
39423 +       if (test_and_clear_bit_pda(0, isidle) == 0)
39424 +               return;
39425 +       atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
39426 +}
39427 +
39428 +/* Called from interrupts to signify idle end */
39429 +void exit_idle(void)
39430 +{
39431 +       /* idle loop has pid 0 */
39432 +       if (current->pid)
39433 +               return;
39434 +       __exit_idle();
39435 +}
39436 +
39437 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
39438 +void xen_idle(void)
39439 +{
39440 +       local_irq_disable();
39441 +
39442 +       if (need_resched())
39443 +               local_irq_enable();
39444 +       else {
39445 +               current_thread_info()->status &= ~TS_POLLING;
39446 +               /*
39447 +                * TS_POLLING-cleared state must be visible before we
39448 +                * test NEED_RESCHED:
39449 +                */
39450 +               smp_mb();
39451 +               safe_halt();
39452 +               current_thread_info()->status |= TS_POLLING;
39453 +       }
39454 +}
39455 +
39456 +#ifdef CONFIG_HOTPLUG_CPU
39457 +static inline void play_dead(void)
39458 +{
39459 +       idle_task_exit();
39460 +       local_irq_disable();
39461 +       cpu_clear(smp_processor_id(), cpu_initialized);
39462 +       preempt_enable_no_resched();
39463 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
39464 +       cpu_bringup();
39465 +}
39466 +#else
39467 +static inline void play_dead(void)
39468 +{
39469 +       BUG();
39470 +}
39471 +#endif /* CONFIG_HOTPLUG_CPU */
39472 +
39473 +/*
39474 + * The idle thread. There's no useful work to be
39475 + * done, so just try to conserve power and have a
39476 + * low exit latency (ie sit in a loop waiting for
39477 + * somebody to say that they'd like to reschedule)
39478 + */
39479 +void cpu_idle (void)
39480 +{
39481 +       current_thread_info()->status |= TS_POLLING;
39482 +       /* endless idle loop with no priority at all */
39483 +       while (1) {
39484 +               while (!need_resched()) {
39485 +                       if (__get_cpu_var(cpu_idle_state))
39486 +                               __get_cpu_var(cpu_idle_state) = 0;
39487 +
39488 +                       rmb();
39489 +                       if (cpu_is_offline(smp_processor_id()))
39490 +                               play_dead();
39491 +                       enter_idle();
39492 +                       xen_idle();
39493 +                       /* In many cases the interrupt that ended idle
39494 +                          has already called exit_idle. But some idle
39495 +                          loops can be woken up without interrupt. */
39496 +                       __exit_idle();
39497 +               }
39498 +
39499 +               preempt_enable_no_resched();
39500 +               schedule();
39501 +               preempt_disable();
39502 +       }
39503 +}
39504 +
39505 +void cpu_idle_wait(void)
39506 +{
39507 +       unsigned int cpu, this_cpu = get_cpu();
39508 +       cpumask_t map, tmp = current->cpus_allowed;
39509 +
39510 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
39511 +       put_cpu();
39512 +
39513 +       cpus_clear(map);
39514 +       for_each_online_cpu(cpu) {
39515 +               per_cpu(cpu_idle_state, cpu) = 1;
39516 +               cpu_set(cpu, map);
39517 +       }
39518 +
39519 +       __get_cpu_var(cpu_idle_state) = 0;
39520 +
39521 +       wmb();
39522 +       do {
39523 +               ssleep(1);
39524 +               for_each_online_cpu(cpu) {
39525 +                       if (cpu_isset(cpu, map) &&
39526 +                                       !per_cpu(cpu_idle_state, cpu))
39527 +                               cpu_clear(cpu, map);
39528 +               }
39529 +               cpus_and(map, map, cpu_online_map);
39530 +       } while (!cpus_empty(map));
39531 +
39532 +       set_cpus_allowed(current, tmp);
39533 +}
39534 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
39535 +
39536 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
39537 +/* Always use xen_idle() instead. */
39538 +void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) {}
39539 +
39540 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
39541 +
39542 +/* Prints also some state that isn't saved in the pt_regs */ 
39543 +void __show_regs(struct pt_regs * regs)
39544 +{
39545 +       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
39546 +       unsigned int fsindex,gsindex;
39547 +       unsigned int ds,cs,es; 
39548 +
39549 +       printk("\n");
39550 +       print_modules();
39551 +       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
39552 +               current->pid, current->comm, print_tainted(),
39553 +               init_utsname()->release,
39554 +               (int)strcspn(init_utsname()->version, " "),
39555 +               init_utsname()->version);
39556 +       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
39557 +       printk_address(regs->rip); 
39558 +       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
39559 +               regs->eflags);
39560 +       printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
39561 +              regs->rax, regs->rbx, regs->rcx);
39562 +       printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
39563 +              regs->rdx, regs->rsi, regs->rdi); 
39564 +       printk("RBP: %016lx R08: %016lx R09: %016lx\n",
39565 +              regs->rbp, regs->r8, regs->r9); 
39566 +       printk("R10: %016lx R11: %016lx R12: %016lx\n",
39567 +              regs->r10, regs->r11, regs->r12); 
39568 +       printk("R13: %016lx R14: %016lx R15: %016lx\n",
39569 +              regs->r13, regs->r14, regs->r15); 
39570 +
39571 +       asm("movl %%ds,%0" : "=r" (ds)); 
39572 +       asm("movl %%cs,%0" : "=r" (cs)); 
39573 +       asm("movl %%es,%0" : "=r" (es)); 
39574 +       asm("movl %%fs,%0" : "=r" (fsindex));
39575 +       asm("movl %%gs,%0" : "=r" (gsindex));
39576 +
39577 +       rdmsrl(MSR_FS_BASE, fs);
39578 +       rdmsrl(MSR_GS_BASE, gs); 
39579 +       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
39580 +
39581 +       cr0 = read_cr0();
39582 +       cr2 = 0; /* No real clue how to read it. JQ */
39583 +       cr3 = read_cr3();
39584 +       cr4 = read_cr4();
39585 +
39586 +       printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
39587 +              fs,fsindex,gs,gsindex,shadowgs); 
39588 +       printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
39589 +       printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
39590 +}
39591 +
39592 +void show_regs(struct pt_regs *regs)
39593 +{
39594 +       printk("CPU %d:", smp_processor_id());
39595 +       __show_regs(regs);
39596 +       show_trace(NULL, regs, &regs->rsp);
39597 +}
39598 +
39599 +/*
39600 + * Free current thread data structures etc..
39601 + */
39602 +void exit_thread(void)
39603 +{
39604 +       struct task_struct *me = current;
39605 +       struct thread_struct *t = &me->thread;
39606 +
39607 +       if (me->thread.io_bitmap_ptr) { 
39608 +#ifndef CONFIG_X86_NO_TSS
39609 +               struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
39610 +#endif
39611 +#ifdef CONFIG_XEN
39612 +               struct physdev_set_iobitmap iobmp_op = { 0 };
39613 +#endif
39614 +
39615 +               kfree(t->io_bitmap_ptr);
39616 +               t->io_bitmap_ptr = NULL;
39617 +               clear_thread_flag(TIF_IO_BITMAP);
39618 +               /*
39619 +                * Careful, clear this in the TSS too:
39620 +                */
39621 +#ifndef CONFIG_X86_NO_TSS
39622 +               memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
39623 +               put_cpu();
39624 +#endif
39625 +#ifdef CONFIG_XEN
39626 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
39627 +#endif
39628 +               t->io_bitmap_max = 0;
39629 +       }
39630 +}
39631 +
39632 +void load_gs_index(unsigned gs)
39633 +{
39634 +       HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
39635 +}
39636 +
39637 +void flush_thread(void)
39638 +{
39639 +       struct task_struct *tsk = current;
39640 +       struct thread_info *t = current_thread_info();
39641 +
39642 +       if (t->flags & _TIF_ABI_PENDING) {
39643 +               t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
39644 +               if (t->flags & _TIF_IA32)
39645 +                       current_thread_info()->status |= TS_COMPAT;
39646 +       }
39647 +       t->flags &= ~_TIF_DEBUG;
39648 +
39649 +       tsk->thread.debugreg0 = 0;
39650 +       tsk->thread.debugreg1 = 0;
39651 +       tsk->thread.debugreg2 = 0;
39652 +       tsk->thread.debugreg3 = 0;
39653 +       tsk->thread.debugreg6 = 0;
39654 +       tsk->thread.debugreg7 = 0;
39655 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
39656 +       /*
39657 +        * Forget coprocessor state..
39658 +        */
39659 +       clear_fpu(tsk);
39660 +       clear_used_math();
39661 +}
39662 +
39663 +void release_thread(struct task_struct *dead_task)
39664 +{
39665 +       if (dead_task->mm) {
39666 +               if (dead_task->mm->context.size) {
39667 +                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
39668 +                                       dead_task->comm,
39669 +                                       dead_task->mm->context.ldt,
39670 +                                       dead_task->mm->context.size);
39671 +                       BUG();
39672 +               }
39673 +       }
39674 +}
39675 +
39676 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
39677 +{
39678 +       struct user_desc ud = { 
39679 +               .base_addr = addr,
39680 +               .limit = 0xfffff,
39681 +               .seg_32bit = 1,
39682 +               .limit_in_pages = 1,
39683 +               .useable = 1,
39684 +       };
39685 +       struct n_desc_struct *desc = (void *)t->thread.tls_array;
39686 +       desc += tls;
39687 +       desc->a = LDT_entry_a(&ud); 
39688 +       desc->b = LDT_entry_b(&ud); 
39689 +}
39690 +
39691 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
39692 +{
39693 +       struct desc_struct *desc = (void *)t->thread.tls_array;
39694 +       desc += tls;
39695 +       return desc->base0 | 
39696 +               (((u32)desc->base1) << 16) | 
39697 +               (((u32)desc->base2) << 24);
39698 +}
39699 +
39700 +/*
39701 + * This gets called before we allocate a new thread and copy
39702 + * the current task into it.
39703 + */
39704 +void prepare_to_copy(struct task_struct *tsk)
39705 +{
39706 +       unlazy_fpu(tsk);
39707 +}
39708 +
39709 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
39710 +               unsigned long unused,
39711 +       struct task_struct * p, struct pt_regs * regs)
39712 +{
39713 +       int err;
39714 +       struct pt_regs * childregs;
39715 +       struct task_struct *me = current;
39716 +
39717 +       childregs = ((struct pt_regs *)
39718 +                       (THREAD_SIZE + task_stack_page(p))) - 1;
39719 +       *childregs = *regs;
39720 +
39721 +       childregs->rax = 0;
39722 +       childregs->rsp = rsp;
39723 +       if (rsp == ~0UL)
39724 +               childregs->rsp = (unsigned long)childregs;
39725 +
39726 +       p->thread.rsp = (unsigned long) childregs;
39727 +       p->thread.rsp0 = (unsigned long) (childregs+1);
39728 +       p->thread.userrsp = me->thread.userrsp; 
39729 +
39730 +       set_tsk_thread_flag(p, TIF_FORK);
39731 +
39732 +       p->thread.fs = me->thread.fs;
39733 +       p->thread.gs = me->thread.gs;
39734 +
39735 +       asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
39736 +       asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
39737 +       asm("mov %%es,%0" : "=m" (p->thread.es));
39738 +       asm("mov %%ds,%0" : "=m" (p->thread.ds));
39739 +
39740 +       if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
39741 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
39742 +               if (!p->thread.io_bitmap_ptr) {
39743 +                       p->thread.io_bitmap_max = 0;
39744 +                       return -ENOMEM;
39745 +               }
39746 +               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
39747 +                               IO_BITMAP_BYTES);
39748 +               set_tsk_thread_flag(p, TIF_IO_BITMAP);
39749 +       } 
39750 +
39751 +       /*
39752 +        * Set a new TLS for the child thread?
39753 +        */
39754 +       if (clone_flags & CLONE_SETTLS) {
39755 +#ifdef CONFIG_IA32_EMULATION
39756 +               if (test_thread_flag(TIF_IA32))
39757 +                       err = ia32_child_tls(p, childregs); 
39758 +               else                    
39759 +#endif  
39760 +                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
39761 +               if (err) 
39762 +                       goto out;
39763 +       }
39764 +        p->thread.iopl = current->thread.iopl;
39765 +
39766 +       err = 0;
39767 +out:
39768 +       if (err && p->thread.io_bitmap_ptr) {
39769 +               kfree(p->thread.io_bitmap_ptr);
39770 +               p->thread.io_bitmap_max = 0;
39771 +       }
39772 +       return err;
39773 +}
39774 +
39775 +static inline void __save_init_fpu( struct task_struct *tsk )
39776 +{
39777 +       asm volatile( "rex64 ; fxsave %0 ; fnclex"
39778 +                     : "=m" (tsk->thread.i387.fxsave));
39779 +       tsk->thread_info->status &= ~TS_USEDFPU;
39780 +}
39781 +
39782 +/*
39783 + * This special macro can be used to load a debugging register
39784 + */
39785 +#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
39786 +
39787 +static inline void __switch_to_xtra(struct task_struct *prev_p,
39788 +                                   struct task_struct *next_p)
39789 +{
39790 +       struct thread_struct *prev, *next;
39791 +
39792 +       prev = &prev_p->thread,
39793 +       next = &next_p->thread;
39794 +
39795 +       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
39796 +               loaddebug(next, 0);
39797 +               loaddebug(next, 1);
39798 +               loaddebug(next, 2);
39799 +               loaddebug(next, 3);
39800 +               /* no 4 and 5 */
39801 +               loaddebug(next, 6);
39802 +               loaddebug(next, 7);
39803 +       }
39804 +#ifndef CONFIG_XEN
39805 +       if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
39806 +               /*
39807 +                * Copy the relevant range of the IO bitmap.
39808 +                * Normally this is 128 bytes or less:
39809 +                */
39810 +               memcpy(tss->io_bitmap, next->io_bitmap_ptr,
39811 +                      max(prev->io_bitmap_max, next->io_bitmap_max));
39812 +       } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
39813 +               /*
39814 +                * Clear any possible leftover bits:
39815 +                */
39816 +               memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
39817 +       }
39818 +#endif
39819 +}
39820 +
39821 +/*
39822 + *     switch_to(x,y) should switch tasks from x to y.
39823 + *
39824 + * This could still be optimized: 
39825 + * - fold all the options into a flag word and test it with a single test.
39826 + * - could test fs/gs bitsliced
39827 + *
39828 + * Kprobes not supported here. Set the probe on schedule instead.
39829 + */
39830 +__kprobes struct task_struct *
39831 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
39832 +{
39833 +       struct thread_struct *prev = &prev_p->thread,
39834 +                                *next = &next_p->thread;
39835 +       int cpu = smp_processor_id();  
39836 +#ifndef CONFIG_X86_NO_TSS
39837 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
39838 +#endif
39839 +       struct physdev_set_iopl iopl_op;
39840 +       struct physdev_set_iobitmap iobmp_op;
39841 +       multicall_entry_t _mcl[8], *mcl = _mcl;
39842 +
39843 +       /*
39844 +        * Reload esp0, LDT and the page table pointer:
39845 +        */
39846 +       mcl->op      = __HYPERVISOR_stack_switch;
39847 +       mcl->args[0] = __KERNEL_DS;
39848 +       mcl->args[1] = next->rsp0;
39849 +       mcl++;
39850 +
39851 +       /* we're going to use this soon, after a few expensive things */
39852 +       if (next_p->fpu_counter>5)
39853 +               prefetch(&next->i387.fxsave);
39854 +
39855 +       /*
39856 +        * Load the per-thread Thread-Local Storage descriptor.
39857 +        * This is load_TLS(next, cpu) with multicalls.
39858 +        */
39859 +#define C(i) do {                                                      \
39860 +       if (unlikely(next->tls_array[i] != prev->tls_array[i])) {       \
39861 +               mcl->op      = __HYPERVISOR_update_descriptor;          \
39862 +               mcl->args[0] = virt_to_machine(                         \
39863 +                       &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);          \
39864 +               mcl->args[1] = next->tls_array[i];                      \
39865 +               mcl++;                                                  \
39866 +       }                                                               \
39867 +} while (0)
39868 +       C(0); C(1); C(2);
39869 +#undef C
39870 +
39871 +       if (unlikely(prev->iopl != next->iopl)) {
39872 +               iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
39873 +               mcl->op      = __HYPERVISOR_physdev_op;
39874 +               mcl->args[0] = PHYSDEVOP_set_iopl;
39875 +               mcl->args[1] = (unsigned long)&iopl_op;
39876 +               mcl++;
39877 +       }
39878 +
39879 +       if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP) ||
39880 +           test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
39881 +               iobmp_op.bitmap   = (char *)next->io_bitmap_ptr;
39882 +               iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
39883 +               mcl->op      = __HYPERVISOR_physdev_op;
39884 +               mcl->args[0] = PHYSDEVOP_set_iobitmap;
39885 +               mcl->args[1] = (unsigned long)&iobmp_op;
39886 +               mcl++;
39887 +       }
39888 +
39889 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
39890 +       /* 
39891 +        * Switch DS and ES.
39892 +        * This won't pick up thread selector changes, but I guess that is ok.
39893 +        */
39894 +       if (unlikely(next->es))
39895 +               loadsegment(es, next->es); 
39896 +       
39897 +       if (unlikely(next->ds))
39898 +               loadsegment(ds, next->ds);
39899 +
39900 +       /* 
39901 +        * Switch FS and GS.
39902 +        */
39903 +       if (unlikely(next->fsindex))
39904 +               loadsegment(fs, next->fsindex);
39905 +
39906 +       if (next->fs)
39907 +               HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs); 
39908 +       
39909 +       if (unlikely(next->gsindex))
39910 +               load_gs_index(next->gsindex);
39911 +
39912 +       if (next->gs)
39913 +               HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs); 
39914 +
39915 +       /* Must be after DS reload */
39916 +       /*
39917 +        * This is basically '__unlazy_fpu'
39918 +        */
39919 +       if (prev_p->thread_info->status & TS_USEDFPU) {
39920 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
39921 +               HYPERVISOR_fpu_taskswitch(1);
39922 +       }
39923 +
39924 +       /* 
39925 +        * Switch the PDA and FPU contexts.
39926 +        */
39927 +       prev->userrsp = read_pda(oldrsp); 
39928 +       write_pda(oldrsp, next->userrsp); 
39929 +       write_pda(pcurrent, next_p); 
39930 +
39931 +       write_pda(kernelstack,
39932 +       (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
39933 +#ifdef CONFIG_CC_STACKPROTECTOR
39934 +       write_pda(stack_canary, next_p->stack_canary);
39935 +       /*
39936 +        * Build time only check to make sure the stack_canary is at
39937 +        * offset 40 in the pda; this is a gcc ABI requirement
39938 +        */
39939 +       BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
39940 +#endif
39941 +
39942 +       /*
39943 +        * Now maybe reload the debug registers and handle I/O bitmaps
39944 +        */
39945 +       if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
39946 +           || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
39947 +               __switch_to_xtra(prev_p, next_p);
39948 +
39949 +       /* If the task has used fpu the last 5 timeslices, just do a full
39950 +        * restore of the math state immediately to avoid the trap; the
39951 +        * chances of needing FPU soon are obviously high now
39952 +        */
39953 +       if (next_p->fpu_counter>5)
39954 +               math_state_restore();
39955 +       return prev_p;
39956 +}
39957 +
39958 +/*
39959 + * sys_execve() executes a new program.
39960 + */
39961 +asmlinkage 
39962 +long sys_execve(char __user *name, char __user * __user *argv,
39963 +               char __user * __user *envp, struct pt_regs regs)
39964 +{
39965 +       long error;
39966 +       char * filename;
39967 +
39968 +       filename = getname(name);
39969 +       error = PTR_ERR(filename);
39970 +       if (IS_ERR(filename)) 
39971 +               return error;
39972 +       error = do_execve(filename, argv, envp, &regs); 
39973 +       if (error == 0) {
39974 +               task_lock(current);
39975 +               current->ptrace &= ~PT_DTRACE;
39976 +               task_unlock(current);
39977 +       }
39978 +       putname(filename);
39979 +       return error;
39980 +}
39981 +
39982 +void set_personality_64bit(void)
39983 +{
39984 +       /* inherit personality from parent */
39985 +
39986 +       /* Make sure to be in 64bit mode */
39987 +       clear_thread_flag(TIF_IA32); 
39988 +
39989 +       /* TBD: overwrites user setup. Should have two bits.
39990 +          But 64bit processes have always behaved this way,
39991 +          so it's not too bad. The main problem is just that
39992 +          32bit childs are affected again. */
39993 +       current->personality &= ~READ_IMPLIES_EXEC;
39994 +}
39995 +
39996 +asmlinkage long sys_fork(struct pt_regs *regs)
39997 +{
39998 +       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
39999 +}
40000 +
40001 +asmlinkage long
40002 +sys_clone(unsigned long clone_flags, unsigned long newsp,
40003 +         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
40004 +{
40005 +       if (!newsp)
40006 +               newsp = regs->rsp;
40007 +       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
40008 +}
40009 +
40010 +/*
40011 + * This is trivial, and on the face of it looks like it
40012 + * could equally well be done in user mode.
40013 + *
40014 + * Not so, for quite unobvious reasons - register pressure.
40015 + * In user mode vfork() cannot have a stack frame, and if
40016 + * done by calling the "clone()" system call directly, you
40017 + * do not have enough call-clobbered registers to hold all
40018 + * the information you need.
40019 + */
40020 +asmlinkage long sys_vfork(struct pt_regs *regs)
40021 +{
40022 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
40023 +                   NULL, NULL);
40024 +}
40025 +
40026 +unsigned long get_wchan(struct task_struct *p)
40027 +{
40028 +       unsigned long stack;
40029 +       u64 fp,rip;
40030 +       int count = 0;
40031 +
40032 +       if (!p || p == current || p->state==TASK_RUNNING)
40033 +               return 0; 
40034 +       stack = (unsigned long)task_stack_page(p);
40035 +       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
40036 +               return 0;
40037 +       fp = *(u64 *)(p->thread.rsp);
40038 +       do { 
40039 +               if (fp < (unsigned long)stack ||
40040 +                   fp > (unsigned long)stack+THREAD_SIZE)
40041 +                       return 0; 
40042 +               rip = *(u64 *)(fp+8); 
40043 +               if (!in_sched_functions(rip))
40044 +                       return rip; 
40045 +               fp = *(u64 *)fp; 
40046 +       } while (count++ < 16); 
40047 +       return 0;
40048 +}
40049 +
40050 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
40051 +{ 
40052 +       int ret = 0; 
40053 +       int doit = task == current;
40054 +       int cpu;
40055 +
40056 +       switch (code) { 
40057 +       case ARCH_SET_GS:
40058 +               if (addr >= TASK_SIZE_OF(task))
40059 +                       return -EPERM; 
40060 +               cpu = get_cpu();
40061 +               /* handle small bases via the GDT because that's faster to 
40062 +                  switch. */
40063 +               if (addr <= 0xffffffff) {  
40064 +                       set_32bit_tls(task, GS_TLS, addr); 
40065 +                       if (doit) { 
40066 +                               load_TLS(&task->thread, cpu);
40067 +                               load_gs_index(GS_TLS_SEL); 
40068 +                       }
40069 +                       task->thread.gsindex = GS_TLS_SEL; 
40070 +                       task->thread.gs = 0;
40071 +               } else { 
40072 +                       task->thread.gsindex = 0;
40073 +                       task->thread.gs = addr;
40074 +                       if (doit) {
40075 +                               load_gs_index(0);
40076 +                               ret = HYPERVISOR_set_segment_base(
40077 +                                       SEGBASE_GS_USER, addr);
40078 +                       } 
40079 +               }
40080 +               put_cpu();
40081 +               break;
40082 +       case ARCH_SET_FS:
40083 +               /* Not strictly needed for fs, but do it for symmetry
40084 +                  with gs */
40085 +               if (addr >= TASK_SIZE_OF(task))
40086 +                       return -EPERM; 
40087 +               cpu = get_cpu();
40088 +               /* handle small bases via the GDT because that's faster to 
40089 +                  switch. */
40090 +               if (addr <= 0xffffffff) { 
40091 +                       set_32bit_tls(task, FS_TLS, addr);
40092 +                       if (doit) { 
40093 +                               load_TLS(&task->thread, cpu); 
40094 +                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
40095 +                       }
40096 +                       task->thread.fsindex = FS_TLS_SEL;
40097 +                       task->thread.fs = 0;
40098 +               } else { 
40099 +                       task->thread.fsindex = 0;
40100 +                       task->thread.fs = addr;
40101 +                       if (doit) {
40102 +                               /* set the selector to 0 to not confuse
40103 +                                  __switch_to */
40104 +                               asm volatile("movl %0,%%fs" :: "r" (0));
40105 +                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
40106 +                                                                 addr);
40107 +                       }
40108 +               }
40109 +               put_cpu();
40110 +               break;
40111 +       case ARCH_GET_FS: { 
40112 +               unsigned long base; 
40113 +               if (task->thread.fsindex == FS_TLS_SEL)
40114 +                       base = read_32bit_tls(task, FS_TLS);
40115 +               else if (doit)
40116 +                       rdmsrl(MSR_FS_BASE, base);
40117 +               else
40118 +                       base = task->thread.fs;
40119 +               ret = put_user(base, (unsigned long __user *)addr); 
40120 +               break; 
40121 +       }
40122 +       case ARCH_GET_GS: { 
40123 +               unsigned long base;
40124 +               unsigned gsindex;
40125 +               if (task->thread.gsindex == GS_TLS_SEL)
40126 +                       base = read_32bit_tls(task, GS_TLS);
40127 +               else if (doit) {
40128 +                       asm("movl %%gs,%0" : "=r" (gsindex));
40129 +                       if (gsindex)
40130 +                               rdmsrl(MSR_KERNEL_GS_BASE, base);
40131 +                       else
40132 +                               base = task->thread.gs;
40133 +               }
40134 +               else
40135 +                       base = task->thread.gs;
40136 +               ret = put_user(base, (unsigned long __user *)addr); 
40137 +               break;
40138 +       }
40139 +
40140 +       default:
40141 +               ret = -EINVAL;
40142 +               break;
40143 +       } 
40144 +
40145 +       return ret;     
40146 +} 
40147 +
40148 +long sys_arch_prctl(int code, unsigned long addr)
40149 +{
40150 +       return do_arch_prctl(current, code, addr);
40151 +} 
40152 +
40153 +/* 
40154 + * Capture the user space registers if the task is not running (in user space)
40155 + */
40156 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
40157 +{
40158 +       struct pt_regs *pp, ptregs;
40159 +
40160 +       pp = task_pt_regs(tsk);
40161 +
40162 +       ptregs = *pp; 
40163 +       ptregs.cs &= 0xffff;
40164 +       ptregs.ss &= 0xffff;
40165 +
40166 +       elf_core_copy_regs(regs, &ptregs);
40167 +
40168 +        boot_option_idle_override = 1;
40169 +       return 1;
40170 +}
40171 +
40172 +unsigned long arch_align_stack(unsigned long sp)
40173 +{
40174 +       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
40175 +               sp -= get_random_int() % 8192;
40176 +       return sp & ~0xf;
40177 +}
40178 +
40179 +#ifndef CONFIG_SMP
40180 +void _restore_vcpu(void)
40181 +{
40182 +}
40183 +#endif
40184 diff -ruNp linux-2.6.19/arch/x86_64/kernel/setup-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/setup-xen.c
40185 --- linux-2.6.19/arch/x86_64/kernel/setup-xen.c 1970-01-01 00:00:00.000000000 +0000
40186 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/setup-xen.c       2007-02-02 19:10:26.000000000 +0000
40187 @@ -0,0 +1,1482 @@
40188 +/*
40189 + *  linux/arch/x86-64/kernel/setup.c
40190 + *
40191 + *  Copyright (C) 1995  Linus Torvalds
40192 + *
40193 + *  Nov 2001 Dave Jones <davej@suse.de>
40194 + *  Forked from i386 setup code.
40195 + */
40196 +
40197 +/*
40198 + * This file handles the architecture-dependent parts of initialization
40199 + */
40200 +
40201 +#include <linux/errno.h>
40202 +#include <linux/sched.h>
40203 +#include <linux/kernel.h>
40204 +#include <linux/mm.h>
40205 +#include <linux/stddef.h>
40206 +#include <linux/unistd.h>
40207 +#include <linux/ptrace.h>
40208 +#include <linux/slab.h>
40209 +#include <linux/user.h>
40210 +#include <linux/a.out.h>
40211 +#include <linux/screen_info.h>
40212 +#include <linux/ioport.h>
40213 +#include <linux/delay.h>
40214 +#include <linux/init.h>
40215 +#include <linux/initrd.h>
40216 +#include <linux/highmem.h>
40217 +#include <linux/bootmem.h>
40218 +#include <linux/module.h>
40219 +#include <asm/processor.h>
40220 +#include <linux/console.h>
40221 +#include <linux/seq_file.h>
40222 +#include <linux/crash_dump.h>
40223 +#include <linux/root_dev.h>
40224 +#include <linux/pci.h>
40225 +#include <linux/acpi.h>
40226 +#include <linux/kallsyms.h>
40227 +#include <linux/edd.h>
40228 +#include <linux/mmzone.h>
40229 +#include <linux/kexec.h>
40230 +#include <linux/cpufreq.h>
40231 +#include <linux/dmi.h>
40232 +#include <linux/dma-mapping.h>
40233 +#include <linux/ctype.h>
40234 +
40235 +#include <asm/mtrr.h>
40236 +#include <asm/uaccess.h>
40237 +#include <asm/system.h>
40238 +#include <asm/io.h>
40239 +#include <asm/smp.h>
40240 +#include <asm/msr.h>
40241 +#include <asm/desc.h>
40242 +#include <video/edid.h>
40243 +#include <asm/e820.h>
40244 +#include <asm/dma.h>
40245 +#include <asm/mpspec.h>
40246 +#include <asm/mmu_context.h>
40247 +#include <asm/bootsetup.h>
40248 +#include <asm/proto.h>
40249 +#include <asm/setup.h>
40250 +#include <asm/mach_apic.h>
40251 +#include <asm/numa.h>
40252 +#include <asm/sections.h>
40253 +#include <asm/dmi.h>
40254 +#ifdef CONFIG_XEN
40255 +#include <linux/percpu.h>
40256 +#include <linux/pfn.h>
40257 +#include <xen/interface/physdev.h>
40258 +#include "setup_arch_pre.h"
40259 +#include <asm/hypervisor.h>
40260 +#include <xen/interface/nmi.h>
40261 +#include <xen/features.h>
40262 +#include <xen/xencons.h>
40263 +#include <asm/mach-xen/setup_arch_post.h>
40264 +#include <xen/interface/memory.h>
40265 +
40266 +#ifdef CONFIG_XEN
40267 +#include <xen/interface/kexec.h>
40268 +#endif
40269 +
40270 +extern unsigned long start_pfn;
40271 +
40272 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
40273 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
40274 +
40275 +extern char hypercall_page[PAGE_SIZE];
40276 +EXPORT_SYMBOL(hypercall_page);
40277 +
40278 +/* Allows setting of maximum possible memory size  */
40279 +unsigned long xen_override_max_pfn;
40280 +
40281 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
40282 +static struct notifier_block xen_panic_block = {
40283 +       xen_panic_event, NULL, 0 /* try to go last */
40284 +};
40285 +
40286 +unsigned long *phys_to_machine_mapping;
40287 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
40288 +
40289 +EXPORT_SYMBOL(phys_to_machine_mapping);
40290 +
40291 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
40292 +DEFINE_PER_CPU(int, nr_multicall_ents);
40293 +
40294 +/* Raw start-of-day parameters from the hypervisor. */
40295 +start_info_t *xen_start_info;
40296 +EXPORT_SYMBOL(xen_start_info);
40297 +#endif
40298 +
40299 +/*
40300 + * Machine setup..
40301 + */
40302 +
40303 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
40304 +EXPORT_SYMBOL(boot_cpu_data);
40305 +
40306 +unsigned long mmu_cr4_features;
40307 +
40308 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
40309 +int bootloader_type;
40310 +
40311 +unsigned long saved_video_mode;
40312 +
40313 +/* 
40314 + * Early DMI memory
40315 + */
40316 +int dmi_alloc_index;
40317 +char dmi_alloc_data[DMI_MAX_DATA];
40318 +
40319 +/*
40320 + * Setup options
40321 + */
40322 +struct screen_info screen_info;
40323 +EXPORT_SYMBOL(screen_info);
40324 +struct sys_desc_table_struct {
40325 +       unsigned short length;
40326 +       unsigned char table[0];
40327 +};
40328 +
40329 +struct edid_info edid_info;
40330 +EXPORT_SYMBOL_GPL(edid_info);
40331 +#ifdef CONFIG_XEN
40332 +struct e820map machine_e820;
40333 +#endif
40334 +
40335 +extern int root_mountflags;
40336 +
40337 +char command_line[COMMAND_LINE_SIZE];
40338 +
40339 +struct resource standard_io_resources[] = {
40340 +       { .name = "dma1", .start = 0x00, .end = 0x1f,
40341 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40342 +       { .name = "pic1", .start = 0x20, .end = 0x21,
40343 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40344 +       { .name = "timer0", .start = 0x40, .end = 0x43,
40345 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40346 +       { .name = "timer1", .start = 0x50, .end = 0x53,
40347 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40348 +       { .name = "keyboard", .start = 0x60, .end = 0x6f,
40349 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40350 +       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
40351 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40352 +       { .name = "pic2", .start = 0xa0, .end = 0xa1,
40353 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40354 +       { .name = "dma2", .start = 0xc0, .end = 0xdf,
40355 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40356 +       { .name = "fpu", .start = 0xf0, .end = 0xff,
40357 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
40358 +};
40359 +
40360 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
40361 +
40362 +struct resource data_resource = {
40363 +       .name = "Kernel data",
40364 +       .start = 0,
40365 +       .end = 0,
40366 +       .flags = IORESOURCE_RAM,
40367 +};
40368 +struct resource code_resource = {
40369 +       .name = "Kernel code",
40370 +       .start = 0,
40371 +       .end = 0,
40372 +       .flags = IORESOURCE_RAM,
40373 +};
40374 +
40375 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
40376 +
40377 +static struct resource system_rom_resource = {
40378 +       .name = "System ROM",
40379 +       .start = 0xf0000,
40380 +       .end = 0xfffff,
40381 +       .flags = IORESOURCE_ROM,
40382 +};
40383 +
40384 +static struct resource extension_rom_resource = {
40385 +       .name = "Extension ROM",
40386 +       .start = 0xe0000,
40387 +       .end = 0xeffff,
40388 +       .flags = IORESOURCE_ROM,
40389 +};
40390 +
40391 +static struct resource adapter_rom_resources[] = {
40392 +       { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
40393 +               .flags = IORESOURCE_ROM },
40394 +       { .name = "Adapter ROM", .start = 0, .end = 0,
40395 +               .flags = IORESOURCE_ROM },
40396 +       { .name = "Adapter ROM", .start = 0, .end = 0,
40397 +               .flags = IORESOURCE_ROM },
40398 +       { .name = "Adapter ROM", .start = 0, .end = 0,
40399 +               .flags = IORESOURCE_ROM },
40400 +       { .name = "Adapter ROM", .start = 0, .end = 0,
40401 +               .flags = IORESOURCE_ROM },
40402 +       { .name = "Adapter ROM", .start = 0, .end = 0,
40403 +               .flags = IORESOURCE_ROM }
40404 +};
40405 +
40406 +static struct resource video_rom_resource = {
40407 +       .name = "Video ROM",
40408 +       .start = 0xc0000,
40409 +       .end = 0xc7fff,
40410 +       .flags = IORESOURCE_ROM,
40411 +};
40412 +
40413 +static struct resource video_ram_resource = {
40414 +       .name = "Video RAM area",
40415 +       .start = 0xa0000,
40416 +       .end = 0xbffff,
40417 +       .flags = IORESOURCE_RAM,
40418 +};
40419 +
40420 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
40421 +
40422 +static int __init romchecksum(unsigned char *rom, unsigned long length)
40423 +{
40424 +       unsigned char *p, sum = 0;
40425 +
40426 +       for (p = rom; p < rom + length; p++)
40427 +               sum += *p;
40428 +       return sum == 0;
40429 +}
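romsignature()/romchecksum() encode the classic ISA option-ROM convention: a 0xAA55 signature word, a length byte counted in 512-byte units, and all bytes of the image summing to zero modulo 256. A minimal userspace sketch of that rule (the buffer below is a made-up image, not real ROM contents):

```c
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Sum every byte; a valid option ROM sums to 0 modulo 256. */
static int rom_checksum_ok(const uint8_t *rom, size_t len)
{
        uint8_t sum = 0;
        for (size_t i = 0; i < len; i++)
                sum += rom[i];
        return sum == 0;
}

int main(void)
{
        uint8_t rom[512];
        uint8_t sum = 0;

        memset(rom, 0, sizeof(rom));
        rom[0] = 0x55; rom[1] = 0xAA;   /* signature, little-endian 0xAA55 */
        rom[2] = 1;                     /* length: 1 * 512 bytes */

        /* Patch the last byte so the whole image sums to 0 mod 256. */
        for (size_t i = 0; i < sizeof(rom) - 1; i++)
                sum += rom[i];
        rom[sizeof(rom) - 1] = (uint8_t)(0x100 - sum);

        printf("signature ok: %d, checksum ok: %d\n",
               rom[0] == 0x55 && rom[1] == 0xAA,
               rom_checksum_ok(rom, rom[2] * 512));
        return 0;
}
```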
40430 +
40431 +static void __init probe_roms(void)
40432 +{
40433 +       unsigned long start, length, upper;
40434 +       unsigned char *rom;
40435 +       int           i;
40436 +
40437 +#ifdef CONFIG_XEN
40438 +       /* Nothing to do if not running in dom0. */
40439 +       if (!is_initial_xendomain())
40440 +               return;
40441 +#endif
40442 +
40443 +       /* video rom */
40444 +       upper = adapter_rom_resources[0].start;
40445 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
40446 +               rom = isa_bus_to_virt(start);
40447 +               if (!romsignature(rom))
40448 +                       continue;
40449 +
40450 +               video_rom_resource.start = start;
40451 +
40452 +               /* 0 < length <= 0x7f * 512, historically */
40453 +               length = rom[2] * 512;
40454 +
40455 +               /* if checksum okay, trust length byte */
40456 +               if (length && romchecksum(rom, length))
40457 +                       video_rom_resource.end = start + length - 1;
40458 +
40459 +               request_resource(&iomem_resource, &video_rom_resource);
40460 +               break;
40461 +       }
40462 +
40463 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
40464 +       if (start < upper)
40465 +               start = upper;
40466 +
40467 +       /* system rom */
40468 +       request_resource(&iomem_resource, &system_rom_resource);
40469 +       upper = system_rom_resource.start;
40470 +
40471 +       /* check for extension rom (ignore length byte!) */
40472 +       rom = isa_bus_to_virt(extension_rom_resource.start);
40473 +       if (romsignature(rom)) {
40474 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
40475 +               if (romchecksum(rom, length)) {
40476 +                       request_resource(&iomem_resource, &extension_rom_resource);
40477 +                       upper = extension_rom_resource.start;
40478 +               }
40479 +       }
40480 +
40481 +       /* check for adapter roms on 2k boundaries */
40482 +       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
40483 +            start += 2048) {
40484 +               rom = isa_bus_to_virt(start);
40485 +               if (!romsignature(rom))
40486 +                       continue;
40487 +
40488 +               /* 0 < length <= 0x7f * 512, historically */
40489 +               length = rom[2] * 512;
40490 +
40491 +               /* but accept any length that fits if checksum okay */
40492 +               if (!length || start + length > upper || !romchecksum(rom, length))
40493 +                       continue;
40494 +
40495 +               adapter_rom_resources[i].start = start;
40496 +               adapter_rom_resources[i].end = start + length - 1;
40497 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
40498 +
40499 +               start = adapter_rom_resources[i++].end & ~2047UL;
40500 +       }
40501 +}
40502 +
40503 +#ifdef CONFIG_PROC_VMCORE
40504 +/* elfcorehdr= specifies the location of elf core header
40505 + * stored by the crashed kernel. This option will be passed
40506 + * by kexec loader to the capture kernel.
40507 + */
40508 +static int __init setup_elfcorehdr(char *arg)
40509 +{
40510 +       char *end;
40511 +       if (!arg)
40512 +               return -EINVAL;
40513 +       elfcorehdr_addr = memparse(arg, &end);
40514 +       return end > arg ? 0 : -EINVAL;
40515 +}
40516 +early_param("elfcorehdr", setup_elfcorehdr);
40517 +#endif
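setup_elfcorehdr() relies on memparse(), which reads a number with an optional K/M/G suffix and reports where parsing stopped; the `end > arg` comparison is how success is detected. A simplified userspace approximation of that behaviour (not the kernel's implementation):

```c
#include <stdio.h>
#include <stdlib.h>

/* Parse "<number>[KMG]" and report the end pointer, memparse-style. */
static unsigned long long parse_size(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {
        case 'G': case 'g': v <<= 10; /* fall through */
        case 'M': case 'm': v <<= 10; /* fall through */
        case 'K': case 'k': v <<= 10; (*end)++; break;
        default: break;
        }
        return v;
}

int main(void)
{
        const char *arg = "16M";
        char *end;
        unsigned long long addr = parse_size(arg, &end);

        /* As in setup_elfcorehdr(): parsing succeeded iff end > arg. */
        printf("value=%llu, consumed %d chars\n", addr, (int)(end - arg));
        return 0;
}
```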
40518 +
40519 +#ifndef CONFIG_NUMA
40520 +static void __init
40521 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
40522 +{
40523 +       unsigned long bootmap_size, bootmap;
40524 +
40525 +       bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
40526 +       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
40527 +       if (bootmap == -1L)
40528 +               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
40529 +       bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
40530 +       e820_register_active_regions(0, start_pfn, end_pfn);
40531 +#ifdef CONFIG_XEN
40532 +       free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
40533 +#else
40534 +       free_bootmem_with_active_regions(0, end_pfn);
40535 +#endif
40536 +       reserve_bootmem(bootmap, bootmap_size);
40537 +} 
40538 +#endif
40539 +
40540 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
40541 +struct edd edd;
40542 +#ifdef CONFIG_EDD_MODULE
40543 +EXPORT_SYMBOL(edd);
40544 +#endif
40545 +/**
40546 + * copy_edd() - Copy the BIOS EDD information
40547 + *              from boot_params into a safe place.
40548 + *
40549 + */
40550 +static inline void copy_edd(void)
40551 +{
40552 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
40553 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
40554 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
40555 +     edd.edd_info_nr = EDD_NR;
40556 +}
40557 +#else
40558 +static inline void copy_edd(void)
40559 +{
40560 +}
40561 +#endif
40562 +
40563 +#ifndef CONFIG_XEN
40564 +#define EBDA_ADDR_POINTER 0x40E
40565 +
40566 +unsigned __initdata ebda_addr;
40567 +unsigned __initdata ebda_size;
40568 +
40569 +static void discover_ebda(void)
40570 +{
40571 +       /*
40572 +        * there is a real-mode segmented pointer pointing to the 
40573 +        * 4K EBDA area at 0x40E
40574 +        */
40575 +       ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
40576 +       ebda_addr <<= 4;
40577 +
40578 +       ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
40579 +
40580 +       /* Round EBDA up to pages */
40581 +       if (ebda_size == 0)
40582 +               ebda_size = 1;
40583 +       ebda_size <<= 10;
40584 +       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
40585 +       if (ebda_size > 64*1024)
40586 +               ebda_size = 64*1024;
40587 +}
40588 +#endif
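discover_ebda() only does real-mode pointer arithmetic: the word at 0x40E is the EBDA segment (linear address = segment << 4), the first word of the EBDA is its size in KiB, and the result is rounded up to whole pages and capped at 64 KiB. The same arithmetic on example values rather than live BIOS data:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static unsigned long round_up_page(unsigned long x)
{
        return (x + PAGE_SIZE - 1) & PAGE_MASK;
}

int main(void)
{
        unsigned short bda_segment = 0x9FC0; /* example word read from 0x40E */
        unsigned long ebda_addr = (unsigned long)bda_segment << 4; /* 0x9FC00 */
        unsigned long ebda_kib = 1;          /* example first word of the EBDA */
        unsigned long ebda_size = ebda_kib << 10;

        /* Cover the partial page the EBDA starts in, cap at 64 KiB. */
        ebda_size = round_up_page(ebda_size + (ebda_addr & ~PAGE_MASK));
        if (ebda_size > 64 * 1024)
                ebda_size = 64 * 1024;

        printf("EBDA at 0x%lx, reserving %lu bytes\n", ebda_addr, ebda_size);
        return 0;
}
```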
40589 +
40590 +void __init setup_arch(char **cmdline_p)
40591 +{
40592 +       printk(KERN_INFO "Command line: %s\n", saved_command_line);
40593 +
40594 +#ifdef CONFIG_XEN
40595 +       /* Register a call for panic conditions. */
40596 +       atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
40597 +
40598 +       ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); 
40599 +       screen_info = SCREEN_INFO;
40600 +
40601 +       if (is_initial_xendomain()) {
40602 +               /* This is drawn from a dump from vgacon:startup in
40603 +                * standard Linux. */
40604 +               screen_info.orig_video_mode = 3;
40605 +               screen_info.orig_video_isVGA = 1;
40606 +               screen_info.orig_video_lines = 25;
40607 +               screen_info.orig_video_cols = 80;
40608 +               screen_info.orig_video_ega_bx = 3;
40609 +               screen_info.orig_video_points = 16;
40610 +               screen_info.orig_y = screen_info.orig_video_lines - 1;
40611 +               if (xen_start_info->console.dom0.info_size >=
40612 +                   sizeof(struct dom0_vga_console_info)) {
40613 +                       const struct dom0_vga_console_info *info =
40614 +                               (struct dom0_vga_console_info *)(
40615 +                                       (char *)xen_start_info +
40616 +                                       xen_start_info->console.dom0.info_off);
40617 +                       dom0_init_screen_info(info);
40618 +               }
40619 +               xen_start_info->console.domU.mfn = 0;
40620 +               xen_start_info->console.domU.evtchn = 0;
40621 +       } else
40622 +               screen_info.orig_video_isVGA = 0;
40623 +#else
40624 +       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
40625 +       screen_info = SCREEN_INFO;
40626 +#endif /* !CONFIG_XEN */
40627 +
40628 +       edid_info = EDID_INFO;
40629 +       saved_video_mode = SAVED_VIDEO_MODE;
40630 +       bootloader_type = LOADER_TYPE;
40631 +
40632 +#ifdef CONFIG_BLK_DEV_RAM
40633 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
40634 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
40635 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
40636 +#endif
40637 +#ifdef CONFIG_XEN
40638 +       setup_xen_features();
40639 +
40640 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
40641 +                            VMASST_TYPE_writable_pagetables);
40642 +
40643 +       ARCH_SETUP
40644 +#endif
40645 +
40646 +       setup_memory_region();
40647 +       copy_edd();
40648 +
40649 +       if (!MOUNT_ROOT_RDONLY)
40650 +               root_mountflags &= ~MS_RDONLY;
40651 +       init_mm.start_code = (unsigned long) &_text;
40652 +       init_mm.end_code = (unsigned long) &_etext;
40653 +       init_mm.end_data = (unsigned long) &_edata;
40654 +       init_mm.brk = (unsigned long) &_end;
40655 +
40656 +       code_resource.start = virt_to_phys(&_text);
40657 +       code_resource.end = virt_to_phys(&_etext)-1;
40658 +       data_resource.start = virt_to_phys(&_etext);
40659 +       data_resource.end = virt_to_phys(&_edata)-1;
40660 +
40661 +       early_identify_cpu(&boot_cpu_data);
40662 +
40663 +       strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
40664 +       *cmdline_p = command_line;
40665 +
40666 +       parse_early_param();
40667 +
40668 +       finish_e820_parsing();
40669 +
40670 +       e820_register_active_regions(0, 0, -1UL);
40671 +       /*
40672 +        * partially used pages are not usable - thus
40673 +        * we are rounding upwards:
40674 +        */
40675 +       end_pfn = e820_end_of_ram();
40676 +       num_physpages = end_pfn;
40677 +
40678 +       check_efer();
40679 +
40680 +#ifndef CONFIG_XEN
40681 +       discover_ebda();
40682 +#endif
40683 +
40684 +       init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
40685 +
40686 +       zap_low_mappings(0);
40687 +
40688 +       /* How many end-of-memory variables you have, grandma! */
40689 +       max_low_pfn = end_pfn;
40690 +       max_pfn = end_pfn;
40691 +       high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
40692 +
40693 +       /* Remove active ranges so rediscovery with NUMA-awareness happens */
40694 +       remove_all_active_ranges();
40695 +
40696 +#ifdef CONFIG_ACPI_NUMA
40697 +       /*
40698 +        * Parse SRAT to discover nodes.
40699 +        */
40700 +       acpi_numa_init();
40701 +#endif
40702 +
40703 +#ifdef CONFIG_NUMA
40704 +       numa_initmem_init(0, end_pfn); 
40705 +#else
40706 +       contig_initmem_init(0, end_pfn);
40707 +#endif
40708 +
40709 +       /* Reserve direct mapping */
40710 +       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
40711 +                               (table_end - table_start) << PAGE_SHIFT);
40712 +
40713 +       /* reserve kernel */
40714 +       reserve_bootmem_generic(__pa_symbol(&_text),
40715 +                               __pa_symbol(&_end) - __pa_symbol(&_text));
40716 +
40717 +#ifdef CONFIG_XEN
40718 +       /* reserve physmap, start info and initial page tables */
40719 +       reserve_bootmem(__pa_symbol(&_end), (table_start<<PAGE_SHIFT)-__pa_symbol(&_end));
40720 +#else
40721 +       /*
40722 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
40723 +        * enabling clean reboots, SMP operation, laptop functions.
40724 +        */
40725 +       reserve_bootmem_generic(0, PAGE_SIZE);
40726 +
40727 +       /* reserve ebda region */
40728 +       if (ebda_addr)
40729 +               reserve_bootmem_generic(ebda_addr, ebda_size);
40730 +#endif
40731 +
40732 +#ifdef CONFIG_SMP
40733 +       /*
40734 +        * But first pinch a few for the stack/trampoline stuff
40735 +        * FIXME: Don't need the extra page at 4K, but need to fix
40736 +        * trampoline before removing it. (see the GDT stuff)
40737 +        */
40738 +       reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
40739 +
40740 +       /* Reserve SMP trampoline */
40741 +       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
40742 +#endif
40743 +
40744 +#ifdef CONFIG_ACPI_SLEEP
40745 +       /*
40746 +        * Reserve low memory region for sleep support.
40747 +        */
40748 +       acpi_reserve_bootmem();
40749 +#endif
40750 +#ifdef CONFIG_XEN
40751 +#ifdef CONFIG_BLK_DEV_INITRD
40752 +       if (xen_start_info->mod_start) {
40753 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
40754 +                       /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
40755 +                       initrd_start = INITRD_START + PAGE_OFFSET;
40756 +                       initrd_end = initrd_start+INITRD_SIZE;
40757 +                       initrd_below_start_ok = 1;
40758 +               } else {
40759 +                       printk(KERN_ERR "initrd extends beyond end of memory "
40760 +                               "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
40761 +                               (unsigned long)(INITRD_START + INITRD_SIZE),
40762 +                               (unsigned long)(end_pfn << PAGE_SHIFT));
40763 +                       initrd_start = 0;
40764 +               }
40765 +       }
40766 +#endif
40767 +#else  /* CONFIG_XEN */
40768 +#ifdef CONFIG_BLK_DEV_INITRD
40769 +       if (LOADER_TYPE && INITRD_START) {
40770 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
40771 +                       reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
40772 +                       initrd_start =
40773 +                               INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
40774 +                       initrd_end = initrd_start+INITRD_SIZE;
40775 +               }
40776 +               else {
40777 +                       printk(KERN_ERR "initrd extends beyond end of memory "
40778 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
40779 +                           (unsigned long)(INITRD_START + INITRD_SIZE),
40780 +                           (unsigned long)(end_pfn << PAGE_SHIFT));
40781 +                       initrd_start = 0;
40782 +               }
40783 +       }
40784 +#endif
40785 +#endif /* !CONFIG_XEN */
40786 +#ifdef CONFIG_KEXEC
40787 +#ifdef CONFIG_XEN
40788 +       xen_machine_kexec_setup_resources();
40789 +#else
40790 +       if (crashk_res.start != crashk_res.end) {
40791 +               reserve_bootmem_generic(crashk_res.start,
40792 +                       crashk_res.end - crashk_res.start + 1);
40793 +       }
40794 +#endif
40795 +#endif
40796 +
40797 +       paging_init();
40798 +       /*
40799 +        * Find and reserve possible boot-time SMP configuration:
40800 +        */
40801 +       find_smp_config();
40802 +#ifdef CONFIG_XEN
40803 +       {
40804 +               int i, j, k, fpp;
40805 +
40806 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
40807 +                       /* Make sure we have a large enough P->M table. */
40808 +                       phys_to_machine_mapping = alloc_bootmem_pages(
40809 +                               end_pfn * sizeof(unsigned long));
40810 +                       memset(phys_to_machine_mapping, ~0,
40811 +                              end_pfn * sizeof(unsigned long));
40812 +                       memcpy(phys_to_machine_mapping,
40813 +                              (unsigned long *)xen_start_info->mfn_list,
40814 +                              xen_start_info->nr_pages * sizeof(unsigned long));
40815 +                       free_bootmem(
40816 +                               __pa(xen_start_info->mfn_list),
40817 +                               PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
40818 +                                               sizeof(unsigned long))));
40819 +
40820 +                       /*
40821 +                        * Initialise the list of frames that specify the
40822 +                        * list of frames making up the p2m table. Used by
40823 +                        * save/restore.
40824 +                        */
40825 +                       pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
40826 +                       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
40827 +                               virt_to_mfn(pfn_to_mfn_frame_list_list);
40828 +
40829 +                       fpp = PAGE_SIZE/sizeof(unsigned long);
40830 +                       for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
40831 +                               if ((j % fpp) == 0) {
40832 +                                       k++;
40833 +                                       BUG_ON(k>=fpp);
40834 +                                       pfn_to_mfn_frame_list[k] =
40835 +                                               alloc_bootmem_pages(PAGE_SIZE);
40836 +                                       pfn_to_mfn_frame_list_list[k] =
40837 +                                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
40838 +                                       j=0;
40839 +                               }
40840 +                               pfn_to_mfn_frame_list[k][j] =
40841 +                                       virt_to_mfn(&phys_to_machine_mapping[i]);
40842 +                       }
40843 +                       HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
40844 +               }
40845 +
40846 +       }
40847 +
40848 +       if (is_initial_xendomain())
40849 +               dmi_scan_machine();
40850 +
40851 +#ifdef CONFIG_ACPI
40852 +       if (!is_initial_xendomain()) {
40853 +               acpi_disabled = 1;
40854 +               acpi_ht = 0;
40855 +       }
40856 +#endif
40857 +#endif
40858 +
40859 +#ifndef CONFIG_XEN
40860 +#ifdef CONFIG_PCI
40861 +       early_quirks();
40862 +#endif
40863 +#endif
40864 +
40865 +       /*
40866 +        * Set this early so we don't allocate cpu0
40867 +        * if the MADT list doesn't list the BSP first.
40868 +        * mpparse.c/MP_processor_info() allocates logical cpu numbers.
40869 +        */
40870 +       cpu_set(0, cpu_present_map);
40871 +#ifdef CONFIG_ACPI
40872 +       /*
40873 +        * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
40874 +        * Call this early for SRAT node setup.
40875 +        */
40876 +       acpi_boot_table_init();
40877 +
40878 +       /*
40879 +        * Read APIC and some other early information from ACPI tables.
40880 +        */
40881 +       acpi_boot_init();
40882 +#endif
40883 +
40884 +       init_cpu_to_node();
40885 +
40886 +       /*
40887 +        * get boot-time SMP configuration:
40888 +        */
40889 +       if (smp_found_config)
40890 +               get_smp_config();
40891 +#ifndef CONFIG_XEN
40892 +       init_apic_mappings();
40893 +#endif
40894 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
40895 +       prefill_possible_map();
40896 +#endif
40897 +
40898 +       /*
40899 +        * Request address space for all standard RAM and ROM resources
40900 +        * and also for regions reported as reserved by the e820.
40901 +        */
40902 +       probe_roms();
40903 +#ifdef CONFIG_XEN
40904 +       if (is_initial_xendomain()) {
40905 +               struct xen_memory_map memmap;
40906 +
40907 +               memmap.nr_entries = E820MAX;
40908 +               set_xen_guest_handle(memmap.buffer, machine_e820.map);
40909 +
40910 +               if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
40911 +                       BUG();
40912 +               machine_e820.nr_map = memmap.nr_entries;
40913 +
40914 +               e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
40915 +       }
40916 +#else
40917 +       e820_reserve_resources(e820.map, e820.nr_map);
40918 +#endif
40919 +       e820_mark_nosave_regions();
40920 +
40921 +       request_resource(&iomem_resource, &video_ram_resource);
40922 +
40923 +       {
40924 +       unsigned i;
40925 +       /* request I/O space for devices used on all i[345]86 PCs */
40926 +       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
40927 +               request_resource(&ioport_resource, &standard_io_resources[i]);
40928 +       }
40929 +
40930 +#ifdef CONFIG_XEN
40931 +       if (is_initial_xendomain())
40932 +               e820_setup_gap(machine_e820.map, machine_e820.nr_map);
40933 +#else
40934 +       e820_setup_gap(e820.map, e820.nr_map);
40935 +#endif
40936 +
40937 +#ifdef CONFIG_XEN
40938 +       {
40939 +               struct physdev_set_iopl set_iopl;
40940 +
40941 +               set_iopl.iopl = 1;
40942 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
40943 +
40944 +               if (is_initial_xendomain()) {
40945 +#ifdef CONFIG_VT
40946 +#if defined(CONFIG_VGA_CONSOLE)
40947 +                       conswitchp = &vga_con;
40948 +#elif defined(CONFIG_DUMMY_CONSOLE)
40949 +                       conswitchp = &dummy_con;
40950 +#endif
40951 +#endif
40952 +               } else {
40953 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
40954 +                       conswitchp = &dummy_con;
40955 +#endif
40956 +                }
40957 +       }
40958 +       xencons_early_setup();
40959 +#else  /* CONFIG_XEN */
40960 +
40961 +#ifdef CONFIG_VT
40962 +#if defined(CONFIG_VGA_CONSOLE)
40963 +       conswitchp = &vga_con;
40964 +#elif defined(CONFIG_DUMMY_CONSOLE)
40965 +       conswitchp = &dummy_con;
40966 +#endif
40967 +#endif
40968 +
40969 +#endif /* !CONFIG_XEN */
40970 +}
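The frame-list setup inside setup_arch() above builds a two-level index over the phys-to-machine table for save/restore: each page of pfn_to_mfn_frame_list[] holds fpp (= PAGE_SIZE / sizeof(unsigned long)) machine frame numbers, and pfn_to_mfn_frame_list_list records the frames holding those pages. A sketch of just the indexing arithmetic, assuming 4 KiB pages and unsigned-long-sized entries:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define FPP (PAGE_SIZE / sizeof(unsigned long))   /* p2m entries per page */

int main(void)
{
        unsigned long end_pfn = 1UL << 20;        /* example: 4 GiB of guest RAM */
        unsigned long p2m_pages = (end_pfn + FPP - 1) / FPP;
        unsigned long list_pages = (p2m_pages + FPP - 1) / FPP;

        unsigned long pfn = 123456;
        unsigned long p2m_page = pfn / FPP;       /* which p2m page holds this pfn */
        unsigned long list_page = p2m_page / FPP; /* which frame-list page indexes it */
        unsigned long list_slot = p2m_page % FPP;

        printf("%lu p2m pages, %lu frame-list pages\n", p2m_pages, list_pages);
        printf("pfn %lu -> frame_list[%lu][%lu]\n", pfn, list_page, list_slot);
        return 0;
}
```

With these assumptions (8-byte entries, fpp = 512) one frame-list page indexes 512 * 512 pfns, i.e. 1 GiB, so the 512-slot pfn_to_mfn_frame_list[] array covers up to 512 GiB of guest pseudo-physical memory.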
40971 +
40972 +#ifdef CONFIG_XEN
40973 +static int
40974 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
40975 +{
40976 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
40977 +       /* we're never actually going to get here... */
40978 +       return NOTIFY_DONE;
40979 +}
40980 +#endif /* CONFIG_XEN */
40981 +
40982 +
40983 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
40984 +{
40985 +       unsigned int *v;
40986 +
40987 +       if (c->extended_cpuid_level < 0x80000004)
40988 +               return 0;
40989 +
40990 +       v = (unsigned int *) c->x86_model_id;
40991 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
40992 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
40993 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
40994 +       c->x86_model_id[48] = 0;
40995 +       return 1;
40996 +}
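get_model_name() assembles the 48-byte processor brand string from CPUID leaves 0x80000002-0x80000004. The same data can be read from userspace with the GCC/Clang <cpuid.h> helpers (assumed available on an x86 build); the kernel variant simply trusts extended_cpuid_level instead of re-querying the maximum leaf:

```c
#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void)
{
        unsigned int regs[12];
        unsigned int max_ext;
        char brand[49];

        max_ext = __get_cpuid_max(0x80000000, NULL);
        if (max_ext < 0x80000004) {
                puts("brand string not supported");
                return 0;
        }
        for (unsigned int leaf = 0; leaf < 3; leaf++)
                __get_cpuid(0x80000002 + leaf,
                            &regs[leaf * 4 + 0], &regs[leaf * 4 + 1],
                            &regs[leaf * 4 + 2], &regs[leaf * 4 + 3]);

        memcpy(brand, regs, 48);
        brand[48] = '\0';
        printf("model name: %s\n", brand);
        return 0;
}
```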
40997 +
40998 +
40999 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
41000 +{
41001 +       unsigned int n, dummy, eax, ebx, ecx, edx;
41002 +
41003 +       n = c->extended_cpuid_level;
41004 +
41005 +       if (n >= 0x80000005) {
41006 +               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
41007 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
41008 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
41009 +               c->x86_cache_size=(ecx>>24)+(edx>>24);
41010 +               /* On K8 L1 TLB is inclusive, so don't count it */
41011 +               c->x86_tlbsize = 0;
41012 +       }
41013 +
41014 +       if (n >= 0x80000006) {
41015 +               cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
41016 +               ecx = cpuid_ecx(0x80000006);
41017 +               c->x86_cache_size = ecx >> 16;
41018 +               c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
41019 +
41020 +               printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
41021 +               c->x86_cache_size, ecx & 0xFF);
41022 +       }
41023 +
41024 +       if (n >= 0x80000007)
41025 +               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
41026 +       if (n >= 0x80000008) {
41027 +               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
41028 +               c->x86_virt_bits = (eax >> 8) & 0xff;
41029 +               c->x86_phys_bits = eax & 0xff;
41030 +       }
41031 +}
41032 +
41033 +#ifdef CONFIG_NUMA
41034 +static int nearby_node(int apicid)
41035 +{
41036 +       int i;
41037 +       for (i = apicid - 1; i >= 0; i--) {
41038 +               int node = apicid_to_node[i];
41039 +               if (node != NUMA_NO_NODE && node_online(node))
41040 +                       return node;
41041 +       }
41042 +       for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
41043 +               int node = apicid_to_node[i];
41044 +               if (node != NUMA_NO_NODE && node_online(node))
41045 +                       return node;
41046 +       }
41047 +       return first_node(node_online_map); /* Shouldn't happen */
41048 +}
41049 +#endif
41050 +
41051 +/*
41052 + * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
41053 + * Assumes number of cores is a power of two.
41054 + */
41055 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
41056 +{
41057 +#ifdef CONFIG_SMP
41058 +       unsigned bits;
41059 +#ifdef CONFIG_NUMA
41060 +       int cpu = smp_processor_id();
41061 +       int node = 0;
41062 +       unsigned apicid = hard_smp_processor_id();
41063 +#endif
41064 +       unsigned ecx = cpuid_ecx(0x80000008);
41065 +
41066 +       c->x86_max_cores = (ecx & 0xff) + 1;
41067 +
41068 +       /* CPU telling us the core id bits shift? */
41069 +       bits = (ecx >> 12) & 0xF;
41070 +
41071 +       /* Otherwise recompute */
41072 +       if (bits == 0) {
41073 +               while ((1 << bits) < c->x86_max_cores)
41074 +                       bits++;
41075 +       }
41076 +
41077 +       /* Low order bits define the core id (index of core in socket) */
41078 +       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
41079 +       /* Convert the APIC ID into the socket ID */
41080 +       c->phys_proc_id = phys_pkg_id(bits);
41081 +
41082 +#ifdef CONFIG_NUMA
41083 +       node = c->phys_proc_id;
41084 +       if (apicid_to_node[apicid] != NUMA_NO_NODE)
41085 +               node = apicid_to_node[apicid];
41086 +       if (!node_online(node)) {
41087 +               /* Two possibilities here:
41088 +                  - The CPU is missing memory and no node was created.
41089 +                  In that case try picking one from a nearby CPU
41090 +                  - The APIC IDs differ from the HyperTransport node IDs
41091 +                  which the K8 northbridge parsing fills in.
41092 +                  Assume they are all increased by a constant offset,
41093 +                  but in the same order as the HT nodeids.
41094 +                  If that doesn't result in a usable node fall back to the
41095 +                  path for the previous case.  */
41096 +               int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
41097 +               if (ht_nodeid >= 0 &&
41098 +                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
41099 +                       node = apicid_to_node[ht_nodeid];
41100 +               /* Pick a nearby node */
41101 +               if (!node_online(node))
41102 +                       node = nearby_node(apicid);
41103 +       }
41104 +       numa_set_node(cpu, node);
41105 +
41106 +       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
41107 +#endif
41108 +#endif
41109 +}
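amd_detect_cmp() slices the initial APIC ID: the low `bits` bits (taken from CPUID 0x80000008 ECX[15:12], or derived by rounding the core count up to a power of two) select the core within the socket, and the remaining high bits identify the package. A standalone sketch of that bit slicing on example values rather than a live CPUID read:

```c
#include <stdio.h>

/* Smallest n such that (1 << n) >= x, like the kernel's get_count_order(). */
static unsigned count_order(unsigned x)
{
        unsigned n = 0;
        while ((1u << n) < x)
                n++;
        return n;
}

int main(void)
{
        unsigned ecx = 0x0003;          /* example CPUID 0x80000008 ECX */
        unsigned max_cores = (ecx & 0xff) + 1;
        unsigned bits = (ecx >> 12) & 0xf;
        unsigned apicid = 0x5;          /* example initial APIC ID */

        if (bits == 0)                  /* older parts: derive from core count */
                bits = count_order(max_cores);

        unsigned core_id = apicid & ((1u << bits) - 1);
        unsigned pkg_id  = apicid >> bits;

        printf("%u cores/socket, core id %u, package id %u\n",
               max_cores, core_id, pkg_id);
        return 0;
}
```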
41110 +
41111 +static void __cpuinit init_amd(struct cpuinfo_x86 *c)
41112 +{
41113 +       unsigned level;
41114 +
41115 +#ifdef CONFIG_SMP
41116 +       unsigned long value;
41117 +
41118 +       /*
41119 +        * Disable TLB flush filter by setting HWCR.FFDIS on K8
41120 +        * bit 6 of msr C001_0015
41121 +        *
41122 +        * Errata 63 for SH-B3 steppings
41123 +        * Errata 122 for all steppings (F+ have it disabled by default)
41124 +        */
41125 +       if (c->x86 == 15) {
41126 +               rdmsrl(MSR_K8_HWCR, value);
41127 +               value |= 1 << 6;
41128 +               wrmsrl(MSR_K8_HWCR, value);
41129 +       }
41130 +#endif
41131 +
41132 +       /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
41133 +          3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
41134 +       clear_bit(0*32+31, &c->x86_capability);
41135 +       
41136 +       /* On C+ stepping K8 rep microcode works well for copy/memset */
41137 +       level = cpuid_eax(1);
41138 +       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
41139 +               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
41140 +
41141 +       /* Enable workaround for FXSAVE leak */
41142 +       if (c->x86 >= 6)
41143 +               set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
41144 +
41145 +       level = get_model_name(c);
41146 +       if (!level) {
41147 +               switch (c->x86) { 
41148 +               case 15:
41149 +                       /* Should distinguish models here, but this is only
41150 +                          a fallback anyway. */
41151 +                       strcpy(c->x86_model_id, "Hammer");
41152 +                       break; 
41153 +               } 
41154 +       } 
41155 +       display_cacheinfo(c);
41156 +
41157 +       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
41158 +       if (c->x86_power & (1<<8))
41159 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
41160 +
41161 +       /* Multi core CPU? */
41162 +       if (c->extended_cpuid_level >= 0x80000008)
41163 +               amd_detect_cmp(c);
41164 +
41165 +       /* Fix cpuid4 emulation for more */
41166 +       num_cache_leaves = 3;
41167 +
41168 +       /* When there is only one core no need to synchronize RDTSC */
41169 +       if (num_possible_cpus() == 1)
41170 +               set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
41171 +       else
41172 +               clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
41173 +}
41174 +
41175 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
41176 +{
41177 +#ifdef CONFIG_SMP
41178 +       u32     eax, ebx, ecx, edx;
41179 +       int     index_msb, core_bits;
41180 +
41181 +       cpuid(1, &eax, &ebx, &ecx, &edx);
41182 +
41183 +
41184 +       if (!cpu_has(c, X86_FEATURE_HT))
41185 +               return;
41186 +       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
41187 +               goto out;
41188 +
41189 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
41190 +
41191 +       if (smp_num_siblings == 1) {
41192 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
41193 +       } else if (smp_num_siblings > 1 ) {
41194 +
41195 +               if (smp_num_siblings > NR_CPUS) {
41196 +                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
41197 +                       smp_num_siblings = 1;
41198 +                       return;
41199 +               }
41200 +
41201 +               index_msb = get_count_order(smp_num_siblings);
41202 +               c->phys_proc_id = phys_pkg_id(index_msb);
41203 +
41204 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
41205 +
41206 +               index_msb = get_count_order(smp_num_siblings);
41207 +
41208 +               core_bits = get_count_order(c->x86_max_cores);
41209 +
41210 +               c->cpu_core_id = phys_pkg_id(index_msb) &
41211 +                                              ((1 << core_bits) - 1);
41212 +       }
41213 +out:
41214 +       if ((c->x86_max_cores * smp_num_siblings) > 1) {
41215 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
41216 +               printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id);
41217 +       }
41218 +
41219 +#endif
41220 +}
41221 +
41222 +/*
41223 + * find out the number of processor cores on the die
41224 + */
41225 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
41226 +{
41227 +       unsigned int eax, t;
41228 +
41229 +       if (c->cpuid_level < 4)
41230 +               return 1;
41231 +
41232 +       cpuid_count(4, 0, &eax, &t, &t, &t);
41233 +
41234 +       if (eax & 0x1f)
41235 +               return ((eax >> 26) + 1);
41236 +       else
41237 +               return 1;
41238 +}
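intel_num_cpu_cores() probes CPUID leaf 4, subleaf 0: if the cache-type field in EAX[4:0] is non-zero, EAX[31:26] holds the maximum number of cores per package minus one. A userspace sketch using GCC's __get_cpuid_count() (assumed available; it queries whatever CPU the program runs on):

```c
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid_max(0, NULL) < 4 ||
            !__get_cpuid_count(4, 0, &eax, &ebx, &ecx, &edx) ||
            (eax & 0x1f) == 0) {
                puts("cores per package: 1 (leaf 4 not usable)");
                return 0;
        }
        printf("cores per package: %u\n", (eax >> 26) + 1);
        return 0;
}
```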
41239 +
41240 +static void srat_detect_node(void)
41241 +{
41242 +#ifdef CONFIG_NUMA
41243 +       unsigned node;
41244 +       int cpu = smp_processor_id();
41245 +       int apicid = hard_smp_processor_id();
41246 +
41247 +       /* Don't do the funky fallback heuristics the AMD version employs
41248 +          for now. */
41249 +       node = apicid_to_node[apicid];
41250 +       if (node == NUMA_NO_NODE)
41251 +               node = first_node(node_online_map);
41252 +       numa_set_node(cpu, node);
41253 +
41254 +       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
41255 +#endif
41256 +}
41257 +
41258 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
41259 +{
41260 +       /* Cache sizes */
41261 +       unsigned n;
41262 +
41263 +       init_intel_cacheinfo(c);
41264 +       if (c->cpuid_level > 9 ) {
41265 +               unsigned eax = cpuid_eax(10);
41266 +               /* Check for version and the number of counters */
41267 +               if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
41268 +                       set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
41269 +       }
41270 +
41271 +       n = c->extended_cpuid_level;
41272 +       if (n >= 0x80000008) {
41273 +               unsigned eax = cpuid_eax(0x80000008);
41274 +               c->x86_virt_bits = (eax >> 8) & 0xff;
41275 +               c->x86_phys_bits = eax & 0xff;
41276 +               /* CPUID workaround for Intel 0F34 CPU */
41277 +               if (c->x86_vendor == X86_VENDOR_INTEL &&
41278 +                   c->x86 == 0xF && c->x86_model == 0x3 &&
41279 +                   c->x86_mask == 0x4)
41280 +                       c->x86_phys_bits = 36;
41281 +       }
41282 +
41283 +       if (c->x86 == 15)
41284 +               c->x86_cache_alignment = c->x86_clflush_size * 2;
41285 +       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
41286 +           (c->x86 == 0x6 && c->x86_model >= 0x0e))
41287 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
41288 +       if (c->x86 == 6)
41289 +               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
41290 +       if (c->x86 == 15)
41291 +               set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
41292 +       else
41293 +               clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
41294 +       c->x86_max_cores = intel_num_cpu_cores(c);
41295 +
41296 +       srat_detect_node();
41297 +}
41298 +
41299 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
41300 +{
41301 +       char *v = c->x86_vendor_id;
41302 +
41303 +       if (!strcmp(v, "AuthenticAMD"))
41304 +               c->x86_vendor = X86_VENDOR_AMD;
41305 +       else if (!strcmp(v, "GenuineIntel"))
41306 +               c->x86_vendor = X86_VENDOR_INTEL;
41307 +       else
41308 +               c->x86_vendor = X86_VENDOR_UNKNOWN;
41309 +}
41310 +
41311 +struct cpu_model_info {
41312 +       int vendor;
41313 +       int family;
41314 +       char *model_names[16];
41315 +};
41316 +
41317 +/* Do some early cpuid on the boot CPU to get some parameters that are
41318 +   needed before check_bugs. Everything advanced is in identify_cpu
41319 +   below. */
41320 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
41321 +{
41322 +       u32 tfms;
41323 +
41324 +       c->loops_per_jiffy = loops_per_jiffy;
41325 +       c->x86_cache_size = -1;
41326 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
41327 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
41328 +       c->x86_vendor_id[0] = '\0'; /* Unset */
41329 +       c->x86_model_id[0] = '\0';  /* Unset */
41330 +       c->x86_clflush_size = 64;
41331 +       c->x86_cache_alignment = c->x86_clflush_size;
41332 +       c->x86_max_cores = 1;
41333 +       c->extended_cpuid_level = 0;
41334 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
41335 +
41336 +       /* Get vendor name */
41337 +       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
41338 +             (unsigned int *)&c->x86_vendor_id[0],
41339 +             (unsigned int *)&c->x86_vendor_id[8],
41340 +             (unsigned int *)&c->x86_vendor_id[4]);
41341 +               
41342 +       get_cpu_vendor(c);
41343 +
41344 +       /* Initialize the standard set of capabilities */
41345 +       /* Note that the vendor-specific code below might override */
41346 +
41347 +       /* Intel-defined flags: level 0x00000001 */
41348 +       if (c->cpuid_level >= 0x00000001) {
41349 +               __u32 misc;
41350 +               cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
41351 +                     &c->x86_capability[0]);
41352 +               c->x86 = (tfms >> 8) & 0xf;
41353 +               c->x86_model = (tfms >> 4) & 0xf;
41354 +               c->x86_mask = tfms & 0xf;
41355 +               if (c->x86 == 0xf)
41356 +                       c->x86 += (tfms >> 20) & 0xff;
41357 +               if (c->x86 >= 0x6)
41358 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
41359 +               if (c->x86_capability[0] & (1<<19)) 
41360 +                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
41361 +       } else {
41362 +               /* Have CPUID level 0 only - unheard of */
41363 +               c->x86 = 4;
41364 +       }
41365 +
41366 +#ifdef CONFIG_SMP
41367 +       c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
41368 +#endif
41369 +}
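early_identify_cpu() decodes CPUID leaf 1 EAX (`tfms`): base family in bits 11:8, base model in 7:4, stepping in 3:0, with the extended family (bits 27:20) added when the base family is 0xF and the extended model (bits 19:16) prepended for family >= 6. The same decode as a standalone helper, run on a made-up example value:

```c
#include <stdio.h>

static void decode_fms(unsigned int tfms,
                       unsigned int *family, unsigned int *model,
                       unsigned int *stepping)
{
        *family   = (tfms >> 8) & 0xf;
        *model    = (tfms >> 4) & 0xf;
        *stepping = tfms & 0xf;

        if (*family == 0xf)
                *family += (tfms >> 20) & 0xff;
        if (*family >= 0x6)
                *model += ((tfms >> 16) & 0xf) << 4;
}

int main(void)
{
        unsigned int family, model, stepping;

        decode_fms(0x000206a7, &family, &model, &stepping); /* example EAX value */
        printf("family %u, model 0x%x, stepping %u\n", family, model, stepping);
        return 0;
}
```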
41370 +
41371 +/*
41372 + * This does the hard work of actually picking apart the CPU stuff...
41373 + */
41374 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
41375 +{
41376 +       int i;
41377 +       u32 xlvl;
41378 +
41379 +       early_identify_cpu(c);
41380 +
41381 +       /* AMD-defined flags: level 0x80000001 */
41382 +       xlvl = cpuid_eax(0x80000000);
41383 +       c->extended_cpuid_level = xlvl;
41384 +       if ((xlvl & 0xffff0000) == 0x80000000) {
41385 +               if (xlvl >= 0x80000001) {
41386 +                       c->x86_capability[1] = cpuid_edx(0x80000001);
41387 +                       c->x86_capability[6] = cpuid_ecx(0x80000001);
41388 +               }
41389 +               if (xlvl >= 0x80000004)
41390 +                       get_model_name(c); /* Default name */
41391 +       }
41392 +
41393 +       /* Transmeta-defined flags: level 0x80860001 */
41394 +       xlvl = cpuid_eax(0x80860000);
41395 +       if ((xlvl & 0xffff0000) == 0x80860000) {
41396 +               /* Don't set x86_cpuid_level here for now, to avoid confusion. */
41397 +               if (xlvl >= 0x80860001)
41398 +                       c->x86_capability[2] = cpuid_edx(0x80860001);
41399 +       }
41400 +
41401 +       c->apicid = phys_pkg_id(0);
41402 +
41403 +       /*
41404 +        * Vendor-specific initialization.  In this section we
41405 +        * canonicalize the feature flags, meaning if there are
41406 +        * features a certain CPU supports which CPUID doesn't
41407 +        * tell us, CPUID claiming incorrect flags, or other bugs,
41408 +        * we handle them here.
41409 +        *
41410 +        * At the end of this section, c->x86_capability better
41411 +        * indicate the features this CPU genuinely supports!
41412 +        */
41413 +       switch (c->x86_vendor) {
41414 +       case X86_VENDOR_AMD:
41415 +               init_amd(c);
41416 +               break;
41417 +
41418 +       case X86_VENDOR_INTEL:
41419 +               init_intel(c);
41420 +               break;
41421 +
41422 +       case X86_VENDOR_UNKNOWN:
41423 +       default:
41424 +               display_cacheinfo(c);
41425 +               break;
41426 +       }
41427 +
41428 +       select_idle_routine(c);
41429 +       detect_ht(c); 
41430 +
41431 +       /*
41432 +        * On SMP, boot_cpu_data holds the common feature set between
41433 +        * all CPUs; so make sure that we indicate which features are
41434 +        * common between the CPUs.  The first time this routine gets
41435 +        * executed, c == &boot_cpu_data.
41436 +        */
41437 +       if (c != &boot_cpu_data) {
41438 +               /* AND the already accumulated flags with these */
41439 +               for (i = 0 ; i < NCAPINTS ; i++)
41440 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
41441 +       }
41442 +
41443 +#ifdef CONFIG_X86_MCE
41444 +       mcheck_init(c);
41445 +#endif
41446 +       if (c == &boot_cpu_data)
41447 +               mtrr_bp_init();
41448 +       else
41449 +               mtrr_ap_init();
41450 +#ifdef CONFIG_NUMA
41451 +       numa_add_cpu(smp_processor_id());
41452 +#endif
41453 +}
41454 +
41455 +
41456 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
41457 +{
41458 +       if (c->x86_model_id[0])
41459 +               printk("%s", c->x86_model_id);
41460 +
41461 +       if (c->x86_mask || c->cpuid_level >= 0) 
41462 +               printk(" stepping %02x\n", c->x86_mask);
41463 +       else
41464 +               printk("\n");
41465 +}
41466 +
41467 +/*
41468 + *     Get CPU information for use by the procfs.
41469 + */
41470 +
41471 +static int show_cpuinfo(struct seq_file *m, void *v)
41472 +{
41473 +       struct cpuinfo_x86 *c = v;
41474 +
41475 +       /* 
41476 +        * These flag bits must match the definitions in <asm/cpufeature.h>.
41477 +        * NULL means this bit is undefined or reserved; either way it doesn't
41478 +        * have meaning as far as Linux is concerned.  Note that it's important
41479 +        * to realize there is a difference between this table and CPUID -- if
41480 +        * applications want to get the raw CPUID data, they should access
41481 +        * /dev/cpu/<cpu_nr>/cpuid instead.
41482 +        */
41483 +       static char *x86_cap_flags[] = {
41484 +               /* Intel-defined */
41485 +               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
41486 +               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
41487 +               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
41488 +               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
41489 +
41490 +               /* AMD-defined */
41491 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41492 +               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
41493 +               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
41494 +               NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
41495 +
41496 +               /* Transmeta-defined */
41497 +               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
41498 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41499 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41500 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41501 +
41502 +               /* Other (Linux-defined) */
41503 +               "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
41504 +               "constant_tsc", NULL, NULL,
41505 +               "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41506 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41507 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41508 +
41509 +               /* Intel-defined (#2) */
41510 +               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
41511 +               "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
41512 +               NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
41513 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41514 +
41515 +               /* VIA/Cyrix/Centaur-defined */
41516 +               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
41517 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41518 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41519 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41520 +
41521 +               /* AMD-defined (#2) */
41522 +               "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
41523 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41524 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41525 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
41526 +       };
41527 +       static char *x86_power_flags[] = { 
41528 +               "ts",   /* temperature sensor */
41529 +               "fid",  /* frequency id control */
41530 +               "vid",  /* voltage id control */
41531 +               "ttp",  /* thermal trip */
41532 +               "tm",
41533 +               "stc",
41534 +               NULL,
41535 +               /* nothing */   /* constant_tsc - moved to flags */
41536 +       };
41537 +
41538 +
41539 +#ifdef CONFIG_SMP
41540 +       if (!cpu_online(c-cpu_data))
41541 +               return 0;
41542 +#endif
41543 +
41544 +       seq_printf(m,"processor\t: %u\n"
41545 +                    "vendor_id\t: %s\n"
41546 +                    "cpu family\t: %d\n"
41547 +                    "model\t\t: %d\n"
41548 +                    "model name\t: %s\n",
41549 +                    (unsigned)(c-cpu_data),
41550 +                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
41551 +                    c->x86,
41552 +                    (int)c->x86_model,
41553 +                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
41554 +       
41555 +       if (c->x86_mask || c->cpuid_level >= 0)
41556 +               seq_printf(m, "stepping\t: %d\n", c->x86_mask);
41557 +       else
41558 +               seq_printf(m, "stepping\t: unknown\n");
41559 +       
41560 +       if (cpu_has(c,X86_FEATURE_TSC)) {
41561 +               unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
41562 +               if (!freq)
41563 +                       freq = cpu_khz;
41564 +               seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
41565 +                            freq / 1000, (freq % 1000));
41566 +       }
41567 +
41568 +       /* Cache size */
41569 +       if (c->x86_cache_size >= 0) 
41570 +               seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
41571 +       
41572 +#ifdef CONFIG_SMP
41573 +       if (smp_num_siblings * c->x86_max_cores > 1) {
41574 +               int cpu = c - cpu_data;
41575 +               seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
41576 +               seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
41577 +               seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
41578 +               seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
41579 +       }
41580 +#endif 
41581 +
41582 +       seq_printf(m,
41583 +               "fpu\t\t: yes\n"
41584 +               "fpu_exception\t: yes\n"
41585 +               "cpuid level\t: %d\n"
41586 +               "wp\t\t: yes\n"
41587 +               "flags\t\t:",
41588 +                  c->cpuid_level);
41589 +
41590 +       { 
41591 +               int i; 
41592 +               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
41593 +                       if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
41594 +                               seq_printf(m, " %s", x86_cap_flags[i]);
41595 +       }
41596 +               
41597 +       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
41598 +                  c->loops_per_jiffy/(500000/HZ),
41599 +                  (c->loops_per_jiffy/(5000/HZ)) % 100);
41600 +
41601 +       if (c->x86_tlbsize > 0) 
41602 +               seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
41603 +       seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
41604 +       seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
41605 +
41606 +       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
41607 +                  c->x86_phys_bits, c->x86_virt_bits);
41608 +
41609 +       seq_printf(m, "power management:");
41610 +       {
41611 +               unsigned i;
41612 +               for (i = 0; i < 32; i++) 
41613 +                       if (c->x86_power & (1 << i)) {
41614 +                               if (i < ARRAY_SIZE(x86_power_flags) &&
41615 +                                       x86_power_flags[i])
41616 +                                       seq_printf(m, "%s%s",
41617 +                                               x86_power_flags[i][0]?" ":"",
41618 +                                               x86_power_flags[i]);
41619 +                               else
41620 +                                       seq_printf(m, " [%d]", i);
41621 +                       }
41622 +       }
41623 +
41624 +       seq_printf(m, "\n\n");
41625 +
41626 +       return 0;
41627 +}
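The bogomips line emitted above is loops_per_jiffy scaled to loops per second and divided by 500000 (the divisor historically counts two "bogo-instructions" per delay-loop iteration), printed as an integer with two decimal places. The same formatting trick on example numbers:

```c
#include <stdio.h>

#define HZ 250  /* example kernel tick rate */

int main(void)
{
        unsigned long loops_per_jiffy = 4000000; /* example calibration result */

        /* Same integer math as the seq_printf() above: whole part and
         * two decimal places of loops_per_jiffy * HZ / 500000. */
        printf("bogomips\t: %lu.%02lu\n",
               loops_per_jiffy / (500000 / HZ),
               (loops_per_jiffy / (5000 / HZ)) % 100);
        return 0;
}
```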
41628 +
41629 +static void *c_start(struct seq_file *m, loff_t *pos)
41630 +{
41631 +       return *pos < NR_CPUS ? cpu_data + *pos : NULL;
41632 +}
41633 +
41634 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
41635 +{
41636 +       ++*pos;
41637 +       return c_start(m, pos);
41638 +}
41639 +
41640 +static void c_stop(struct seq_file *m, void *v)
41641 +{
41642 +}
41643 +
41644 +struct seq_operations cpuinfo_op = {
41645 +       .start = c_start,
41646 +       .next = c_next,
41647 +       .stop = c_stop,
41648 +       .show = show_cpuinfo,
41649 +};
41650 +
41651 +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
41652 +#include <linux/platform_device.h>
41653 +static __init int add_pcspkr(void)
41654 +{
41655 +       struct platform_device *pd;
41656 +       int ret;
41657 +
41658 +       pd = platform_device_alloc("pcspkr", -1);
41659 +       if (!pd)
41660 +               return -ENOMEM;
41661 +
41662 +       ret = platform_device_add(pd);
41663 +       if (ret)
41664 +               platform_device_put(pd);
41665 +
41666 +       return ret;
41667 +}
41668 +device_initcall(add_pcspkr);
41669 +#endif
41670 diff -ruNp linux-2.6.19/arch/x86_64/kernel/setup64-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/setup64-xen.c
41671 --- linux-2.6.19/arch/x86_64/kernel/setup64-xen.c       1970-01-01 00:00:00.000000000 +0000
41672 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/setup64-xen.c     2007-02-02 19:10:26.000000000 +0000
41673 @@ -0,0 +1,356 @@
41674 +/* 
41675 + * X86-64 specific CPU setup.
41676 + * Copyright (C) 1995  Linus Torvalds
41677 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
41678 + * See setup.c for older changelog.
41679 + *
41680 + * Jun Nakajima <jun.nakajima@intel.com> 
41681 + *   Modified for Xen
41682 + *
41683 + */ 
41684 +#include <linux/init.h>
41685 +#include <linux/kernel.h>
41686 +#include <linux/sched.h>
41687 +#include <linux/string.h>
41688 +#include <linux/bootmem.h>
41689 +#include <linux/bitops.h>
41690 +#include <linux/module.h>
41691 +#include <asm/bootsetup.h>
41692 +#include <asm/pda.h>
41693 +#include <asm/pgtable.h>
41694 +#include <asm/processor.h>
41695 +#include <asm/desc.h>
41696 +#include <asm/atomic.h>
41697 +#include <asm/mmu_context.h>
41698 +#include <asm/smp.h>
41699 +#include <asm/i387.h>
41700 +#include <asm/percpu.h>
41701 +#include <asm/proto.h>
41702 +#include <asm/sections.h>
41703 +#ifdef CONFIG_XEN
41704 +#include <asm/hypervisor.h>
41705 +#endif
41706 +
41707 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
41708 +
41709 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
41710 +
41711 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
41712 +EXPORT_SYMBOL(_cpu_pda);
41713 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
41714 +
41715 +#ifndef CONFIG_X86_NO_IDT
41716 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 
41717 +#endif
41718 +
41719 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
41720 +
41721 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
41722 +EXPORT_SYMBOL(__supported_pte_mask);
41723 +static int do_not_nx __cpuinitdata = 0;
41724 +
41725 +/* noexec=on|off
41726 +Control non-executable mappings for 64-bit processes.
41727 +
41728 +on     Enable (default)
41729 +off    Disable
41730 +*/ 
41731 +static int __init nonx_setup(char *str)
41732 +{
41733 +       if (!str)
41734 +               return -EINVAL;
41735 +       if (!strncmp(str, "on", 2)) {
41736 +                __supported_pte_mask |= _PAGE_NX; 
41737 +               do_not_nx = 0; 
41738 +       } else if (!strncmp(str, "off", 3)) {
41739 +               do_not_nx = 1;
41740 +               __supported_pte_mask &= ~_PAGE_NX;
41741 +       }
41742 +       return 0;
41743 +} 
41744 +early_param("noexec", nonx_setup);
41745 +
41746 +int force_personality32 = 0; 
41747 +
41748 +/* noexec32=on|off
41749 +Control non-executable heap for 32-bit processes.
41750 +To control the stack too, use noexec=off
41751 +
41752 +on     PROT_READ does not imply PROT_EXEC for 32bit processes
41753 +off    PROT_READ implies PROT_EXEC (default)
41754 +*/
41755 +static int __init nonx32_setup(char *str)
41756 +{
41757 +       if (!strcmp(str, "on"))
41758 +               force_personality32 &= ~READ_IMPLIES_EXEC;
41759 +       else if (!strcmp(str, "off"))
41760 +               force_personality32 |= READ_IMPLIES_EXEC;
41761 +       return 1;
41762 +}
41763 +__setup("noexec32=", nonx32_setup);
41764 +
41765 +/*
41766 + * Great future plan:
41767 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
41768 + * Always point %gs to its beginning
41769 + */
41770 +void __init setup_per_cpu_areas(void)
41771 +{ 
41772 +       int i;
41773 +       unsigned long size;
41774 +
41775 +#ifdef CONFIG_HOTPLUG_CPU
41776 +       prefill_possible_map();
41777 +#endif
41778 +
41779 +       /* Copy section for each CPU (we discard the original) */
41780 +       size = PERCPU_ENOUGH_ROOM;
41781 +
41782 +       printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
41783 +       for_each_cpu_mask (i, cpu_possible_map) {
41784 +               char *ptr;
41785 +
41786 +               if (!NODE_DATA(cpu_to_node(i))) {
41787 +                       printk("cpu with no node %d, num_online_nodes %d\n",
41788 +                              i, num_online_nodes());
41789 +                       ptr = alloc_bootmem(size);
41790 +               } else { 
41791 +                       ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
41792 +               }
41793 +               if (!ptr)
41794 +                       panic("Cannot allocate cpu data for CPU %d\n", i);
41795 +               cpu_pda(i)->data_offset = ptr - __per_cpu_start;
41796 +               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
41797 +       }
41798 +} 
41799 +
41800 +#ifdef CONFIG_XEN
41801 +static void switch_pt(void)
41802 +{
41803 +       xen_pt_switch(__pa(init_level4_pgt));
41804 +        xen_new_user_pt(__pa(init_level4_user_pgt));
41805 +}
41806 +
41807 +void __cpuinit cpu_gdt_init(struct desc_ptr *gdt_descr)
41808 +{
41809 +       unsigned long frames[16];
41810 +       unsigned long va;
41811 +       int f;
41812 +
41813 +       for (va = gdt_descr->address, f = 0;
41814 +            va < gdt_descr->address + gdt_descr->size;
41815 +            va += PAGE_SIZE, f++) {
41816 +               frames[f] = virt_to_mfn(va);
41817 +               make_page_readonly(
41818 +                       (void *)va, XENFEAT_writable_descriptor_tables);
41819 +       }
41820 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size /
41821 +                               sizeof (struct desc_struct)))
41822 +               BUG();
41823 +}
41824 +#else
41825 +static void switch_pt(void)
41826 +{
41827 +       asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
41828 +}
41829 +
41830 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
41831 +{
41832 +       asm volatile("lgdt %0" :: "m" (*gdt_descr));
41833 +       asm volatile("lidt %0" :: "m" (idt_descr));
41834 +}
41835 +#endif
41836 +
41837 +void pda_init(int cpu)
41838 +{ 
41839 +       struct x8664_pda *pda = cpu_pda(cpu);
41840 +
41841 +       /* Set up data that may be needed in __get_free_pages early */
41842 +       asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
41843 +#ifndef CONFIG_XEN
41844 +       /* Memory clobbers used to order PDA accesses */
41845 +       mb();
41846 +       wrmsrl(MSR_GS_BASE, pda);
41847 +       mb();
41848 +#else
41849 +       HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda);
41850 +#endif
41851 +       pda->cpunumber = cpu; 
41852 +       pda->irqcount = -1;
41853 +       pda->kernelstack = 
41854 +               (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
41855 +       pda->active_mm = &init_mm;
41856 +       pda->mmu_state = 0;
41857 +
41858 +       if (cpu == 0) {
41859 +#ifdef CONFIG_XEN
41860 +               xen_init_pt();
41861 +#endif
41862 +               /* others are initialized in smpboot.c */
41863 +               pda->pcurrent = &init_task;
41864 +               pda->irqstackptr = boot_cpu_stack; 
41865 +       } else {
41866 +               pda->irqstackptr = (char *)
41867 +                       __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
41868 +               if (!pda->irqstackptr)
41869 +                       panic("cannot allocate irqstack for cpu %d", cpu); 
41870 +       }
41871 +
41872 +       switch_pt();
41873 +
41874 +       pda->irqstackptr += IRQSTACKSIZE-64;
41875 +} 
41876 +
41877 +#ifndef CONFIG_X86_NO_TSS
41878 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
41879 +__attribute__((section(".bss.page_aligned")));
41880 +#endif
41881 +
41882 +/* May not be marked __init: used by software suspend */
41883 +void syscall_init(void)
41884 +{
41885 +#ifndef CONFIG_XEN
41886 +       /* 
41887 +        * LSTAR and STAR live in a somewhat strange symbiosis.
41888 +        * They both write to the same internal register. STAR allows setting CS/DS,
41889 +        * but only a 32-bit target. LSTAR sets the 64-bit rip.
41890 +        */ 
41891 +       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
41892 +       wrmsrl(MSR_LSTAR, system_call); 
41893 +
41894 +       /* Flags to clear on syscall */
41895 +       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
41896 +#endif
41897 +#ifdef CONFIG_IA32_EMULATION                   
41898 +       syscall32_cpu_init ();
41899 +#endif
41900 +}
41901 +
41902 +void __cpuinit check_efer(void)
41903 +{
41904 +       unsigned long efer;
41905 +
41906 +       rdmsrl(MSR_EFER, efer); 
41907 +        if (!(efer & EFER_NX) || do_not_nx) { 
41908 +                __supported_pte_mask &= ~_PAGE_NX; 
41909 +        }       
41910 +}
41911 +
41912 +unsigned long kernel_eflags;
41913 +
41914 +/*
41915 + * cpu_init() initializes state that is per-CPU. Some data is already
41916 + * initialized (naturally) in the bootstrap process, such as the GDT
41917 + * and IDT. We reload them nevertheless; this function acts as a
41918 + * 'CPU state barrier': nothing should get across.
41919 + * A lot of state is already set up in PDA init.
41920 + */
41921 +void __cpuinit cpu_init (void)
41922 +{
41923 +       int cpu = stack_smp_processor_id();
41924 +#ifndef CONFIG_X86_NO_TSS
41925 +       struct tss_struct *t = &per_cpu(init_tss, cpu);
41926 +       struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
41927 +       unsigned long v; 
41928 +       char *estacks = NULL; 
41929 +       int i;
41930 +#endif
41931 +       struct task_struct *me;
41932 +
41933 +       /* CPU 0 is initialised in head64.c */
41934 +       if (cpu != 0) {
41935 +               pda_init(cpu);
41936 +               zap_low_mappings(cpu);
41937 +       }
41938 +#ifndef CONFIG_X86_NO_TSS
41939 +       else
41940 +               estacks = boot_exception_stacks; 
41941 +#endif
41942 +
41943 +       me = current;
41944 +
41945 +       if (cpu_test_and_set(cpu, cpu_initialized))
41946 +               panic("CPU#%d already initialized!\n", cpu);
41947 +
41948 +       printk("Initializing CPU#%d\n", cpu);
41949 +
41950 +       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
41951 +
41952 +       /*
41953 +        * Initialize the per-CPU GDT with the boot GDT,
41954 +        * and set up the GDT descriptor:
41955 +        */
41956 +#ifndef CONFIG_XEN 
41957 +       if (cpu)
41958 +               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
41959 +#endif
41960 +
41961 +       cpu_gdt_descr[cpu].size = GDT_SIZE;
41962 +       cpu_gdt_init(&cpu_gdt_descr[cpu]);
41963 +
41964 +       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
41965 +       syscall_init();
41966 +
41967 +       wrmsrl(MSR_FS_BASE, 0);
41968 +       wrmsrl(MSR_KERNEL_GS_BASE, 0);
41969 +       barrier(); 
41970 +
41971 +       check_efer();
41972 +
41973 +#ifndef CONFIG_X86_NO_TSS
41974 +       /*
41975 +        * set up and load the per-CPU TSS
41976 +        */
41977 +       for (v = 0; v < N_EXCEPTION_STACKS; v++) {
41978 +               static const unsigned int order[N_EXCEPTION_STACKS] = {
41979 +                       [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
41980 +                       [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
41981 +               };
41982 +               if (cpu) {
41983 +                       estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
41984 +                       if (!estacks)
41985 +                               panic("Cannot allocate exception stack %ld %d\n",
41986 +                                     v, cpu); 
41987 +               }
41988 +               estacks += PAGE_SIZE << order[v];
41989 +               orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
41990 +       }
41991 +
41992 +       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
41993 +       /*
41994 +        * <= is required because the CPU will access up to
41995 +        * 8 bits beyond the end of the IO permission bitmap.
41996 +        */
41997 +       for (i = 0; i <= IO_BITMAP_LONGS; i++)
41998 +               t->io_bitmap[i] = ~0UL;
41999 +#endif
42000 +
42001 +       atomic_inc(&init_mm.mm_count);
42002 +       me->active_mm = &init_mm;
42003 +       if (me->mm)
42004 +               BUG();
42005 +       enter_lazy_tlb(&init_mm, me);
42006 +
42007 +#ifndef CONFIG_X86_NO_TSS
42008 +       set_tss_desc(cpu, t);
42009 +#endif
42010 +#ifndef CONFIG_XEN
42011 +       load_TR_desc();
42012 +#endif
42013 +       load_LDT(&init_mm.context);
42014 +
42015 +       /*
42016 +        * Clear all 6 debug registers:
42017 +        */
42018 +
42019 +       set_debugreg(0UL, 0);
42020 +       set_debugreg(0UL, 1);
42021 +       set_debugreg(0UL, 2);
42022 +       set_debugreg(0UL, 3);
42023 +       set_debugreg(0UL, 6);
42024 +       set_debugreg(0UL, 7);
42025 +
42026 +       fpu_init(); 
42027 +
42028 +       raw_local_save_flags(kernel_eflags);
42029 +}
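
As a rough illustration (not part of the patch itself): a minimal, self-contained userspace C sketch of the noexec=on|off parsing that nonx_setup() performs in setup64-xen.c above. The _PAGE_NX bit value and the variable names here are illustrative stand-ins for the kernel's own definitions, not the real ones.

#include <stdio.h>
#include <string.h>

#define _PAGE_NX (1ULL << 63)                   /* illustrative value only */

static unsigned long long supported_pte_mask = ~0ULL;
static int do_not_nx;

/* Mirrors nonx_setup(): "on" keeps NX in the PTE mask, "off" clears it. */
static int nonx_setup(const char *str)
{
	if (!str)
		return -1;
	if (!strncmp(str, "on", 2)) {
		supported_pte_mask |= _PAGE_NX;
		do_not_nx = 0;
	} else if (!strncmp(str, "off", 3)) {
		do_not_nx = 1;
		supported_pte_mask &= ~_PAGE_NX;
	}
	return 0;
}

int main(void)
{
	nonx_setup("off");
	printf("noexec=off -> do_not_nx=%d, NX %s\n", do_not_nx,
	       (supported_pte_mask & _PAGE_NX) ? "set" : "clear");
	nonx_setup("on");
	printf("noexec=on  -> do_not_nx=%d, NX %s\n", do_not_nx,
	       (supported_pte_mask & _PAGE_NX) ? "set" : "clear");
	return 0;
}

Built with any C compiler, this prints how each setting toggles the NX bit in the supported PTE mask, which is essentially all the early_param handler above does.
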
42030 diff -ruNp linux-2.6.19/arch/x86_64/kernel/smp-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/smp-xen.c
42031 --- linux-2.6.19/arch/x86_64/kernel/smp-xen.c   1970-01-01 00:00:00.000000000 +0000
42032 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/smp-xen.c 2007-02-02 19:10:27.000000000 +0000
42033 @@ -0,0 +1,571 @@
42034 +/*
42035 + *     Intel SMP support routines.
42036 + *
42037 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
42038 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
42039 + *      (c) 2002,2003 Andi Kleen, SuSE Labs.
42040 + *
42041 + *     This code is released under the GNU General Public License version 2 or
42042 + *     later.
42043 + */
42044 +
42045 +#include <linux/init.h>
42046 +
42047 +#include <linux/mm.h>
42048 +#include <linux/delay.h>
42049 +#include <linux/spinlock.h>
42050 +#include <linux/smp_lock.h>
42051 +#include <linux/smp.h>
42052 +#include <linux/kernel_stat.h>
42053 +#include <linux/mc146818rtc.h>
42054 +#include <linux/interrupt.h>
42055 +
42056 +#include <asm/mtrr.h>
42057 +#include <asm/pgalloc.h>
42058 +#include <asm/tlbflush.h>
42059 +#include <asm/mach_apic.h>
42060 +#include <asm/mmu_context.h>
42061 +#include <asm/proto.h>
42062 +#include <asm/apicdef.h>
42063 +#include <asm/idle.h>
42064 +#ifdef CONFIG_XEN
42065 +#include <xen/evtchn.h>
42066 +#endif
42067 +
42068 +#ifndef CONFIG_XEN
42069 +/*
42070 + *     Smarter SMP flushing macros. 
42071 + *             c/o Linus Torvalds.
42072 + *
42073 + *     These mean you can really definitely utterly forget about
42074 + *     writing to user space from interrupts. (It's not allowed anyway).
42075 + *
42076 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
42077 + *
42078 + *     More scalable flush, from Andi Kleen
42079 + *
42080 + *     To avoid global state use 8 different call vectors.
42081 + *     Each CPU uses a specific vector to trigger flushes on other
42082 + *     CPUs. Depending on the received vector the target CPUs look into
42083 + *     the right per cpu variable for the flush data.
42084 + *
42085 + *     With more than 8 CPUs they are hashed to the 8 available
42086 + *     vectors. The limited global vector space forces us to this right now.
42087 + *     In future when interrupts are split into per CPU domains this could be
42088 + *     fixed, at the cost of triggering multiple IPIs in some cases.
42089 + */
42090 +
42091 +union smp_flush_state {
42092 +       struct {
42093 +               cpumask_t flush_cpumask;
42094 +               struct mm_struct *flush_mm;
42095 +               unsigned long flush_va;
42096 +#define FLUSH_ALL      -1ULL
42097 +               spinlock_t tlbstate_lock;
42098 +       };
42099 +       char pad[SMP_CACHE_BYTES];
42100 +} ____cacheline_aligned;
42101 +
42102 +/* State is put into the per CPU data section, but padded
42103 +   to a full cache line because other CPUs can access it and we don't
42104 +   want false sharing in the per cpu data segment. */
42105 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
42106 +#endif
42107 +
42108 +/*
42109 + * We cannot call mmdrop() because we are in interrupt context;
42110 + * instead, update mm->cpu_vm_mask.
42111 + */
42112 +static inline void leave_mm(int cpu)
42113 +{
42114 +       if (read_pda(mmu_state) == TLBSTATE_OK)
42115 +               BUG();
42116 +       cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
42117 +       load_cr3(swapper_pg_dir);
42118 +}
42119 +
42120 +#ifndef CONFIG_XEN
42121 +/*
42122 + *
42123 + * The flush IPI assumes that a thread switch happens in this order:
42124 + * [cpu0: the cpu that switches]
42125 + * 1) switch_mm() either 1a) or 1b)
42126 + * 1a) thread switch to a different mm
42127 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
42128 + *     Stop ipi delivery for the old mm. This is not synchronized with
42129 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
42130 + *     for the wrong mm, and in the worst case we perform a superfluous
42131 + *     tlb flush.
42132 + * 1a2) set cpu mmu_state to TLBSTATE_OK
42133 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
42134 + *     was in lazy tlb mode.
42135 + * 1a3) update cpu active_mm
42136 + *     Now cpu0 accepts tlb flushes for the new mm.
42137 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
42138 + *     Now the other cpus will send tlb flush ipis.
42139 + * 1a4) change cr3.
42140 + * 1b) thread switch without mm change
42141 + *     cpu active_mm is correct, cpu0 already handles
42142 + *     flush ipis.
42143 + * 1b1) set cpu mmu_state to TLBSTATE_OK
42144 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
42145 + *     Atomically set the bit [other cpus will start sending flush ipis],
42146 + *     and test the bit.
42147 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
42148 + * 2) switch %%esp, ie current
42149 + *
42150 + * The interrupt must handle 2 special cases:
42151 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
42152 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
42153 + *   runs in kernel space, the cpu could load tlb entries for user space
42154 + *   pages.
42155 + *
42156 + * The good news is that cpu mmu_state is local to each cpu, no
42157 + * write/read ordering problems.
42158 + */
42159 +
42160 +/*
42161 + * TLB flush IPI:
42162 + *
42163 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
42164 + * 2) Leave the mm if we are in the lazy tlb mode.
42165 + *
42166 + * Interrupts are disabled.
42167 + */
42168 +
42169 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
42170 +{
42171 +       int cpu;
42172 +       int sender;
42173 +       union smp_flush_state *f;
42174 +
42175 +       cpu = smp_processor_id();
42176 +       /*
42177 +        * orig_rax contains the interrupt vector - 256.
42178 +        * Use that to determine where the sender put the data.
42179 +        */
42180 +       sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
42181 +       f = &per_cpu(flush_state, sender);
42182 +
42183 +       if (!cpu_isset(cpu, f->flush_cpumask))
42184 +               goto out;
42185 +               /* 
42186 +                * This was a BUG() but until someone can quote me the
42187 +                * line from the intel manual that guarantees an IPI to
42188 +                * multiple CPUs is retried _only_ on the erroring CPUs,
42189 +                * it's staying as a return.
42190 +                *
42191 +                * BUG();
42192 +                */
42193 +                
42194 +       if (f->flush_mm == read_pda(active_mm)) {
42195 +               if (read_pda(mmu_state) == TLBSTATE_OK) {
42196 +                       if (f->flush_va == FLUSH_ALL)
42197 +                               local_flush_tlb();
42198 +                       else
42199 +                               __flush_tlb_one(f->flush_va);
42200 +               } else
42201 +                       leave_mm(cpu);
42202 +       }
42203 +out:
42204 +       ack_APIC_irq();
42205 +       cpu_clear(cpu, f->flush_cpumask);
42206 +}
42207 +
42208 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
42209 +                                               unsigned long va)
42210 +{
42211 +       int sender;
42212 +       union smp_flush_state *f;
42213 +
42214 +       /* Caller has disabled preemption */
42215 +       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
42216 +       f = &per_cpu(flush_state, sender);
42217 +
42218 +       /* Could avoid this lock when
42219 +          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
42220 +          probably not worth checking this for a cache-hot lock. */
42221 +       spin_lock(&f->tlbstate_lock);
42222 +
42223 +       f->flush_mm = mm;
42224 +       f->flush_va = va;
42225 +       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
42226 +
42227 +       /*
42228 +        * We have to send the IPI only to
42229 +        * CPUs affected.
42230 +        */
42231 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
42232 +
42233 +       while (!cpus_empty(f->flush_cpumask))
42234 +               cpu_relax();
42235 +
42236 +       f->flush_mm = NULL;
42237 +       f->flush_va = 0;
42238 +       spin_unlock(&f->tlbstate_lock);
42239 +}
42240 +
42241 +int __cpuinit init_smp_flush(void)
42242 +{
42243 +       int i;
42244 +       for_each_cpu_mask(i, cpu_possible_map) {
42245 +               spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
42246 +       }
42247 +       return 0;
42248 +}
42249 +
42250 +core_initcall(init_smp_flush);
42251 +       
42252 +void flush_tlb_current_task(void)
42253 +{
42254 +       struct mm_struct *mm = current->mm;
42255 +       cpumask_t cpu_mask;
42256 +
42257 +       preempt_disable();
42258 +       cpu_mask = mm->cpu_vm_mask;
42259 +       cpu_clear(smp_processor_id(), cpu_mask);
42260 +
42261 +       local_flush_tlb();
42262 +       if (!cpus_empty(cpu_mask))
42263 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
42264 +       preempt_enable();
42265 +}
42266 +EXPORT_SYMBOL(flush_tlb_current_task);
42267 +
42268 +void flush_tlb_mm (struct mm_struct * mm)
42269 +{
42270 +       cpumask_t cpu_mask;
42271 +
42272 +       preempt_disable();
42273 +       cpu_mask = mm->cpu_vm_mask;
42274 +       cpu_clear(smp_processor_id(), cpu_mask);
42275 +
42276 +       if (current->active_mm == mm) {
42277 +               if (current->mm)
42278 +                       local_flush_tlb();
42279 +               else
42280 +                       leave_mm(smp_processor_id());
42281 +       }
42282 +       if (!cpus_empty(cpu_mask))
42283 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
42284 +
42285 +       preempt_enable();
42286 +}
42287 +EXPORT_SYMBOL(flush_tlb_mm);
42288 +
42289 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
42290 +{
42291 +       struct mm_struct *mm = vma->vm_mm;
42292 +       cpumask_t cpu_mask;
42293 +
42294 +       preempt_disable();
42295 +       cpu_mask = mm->cpu_vm_mask;
42296 +       cpu_clear(smp_processor_id(), cpu_mask);
42297 +
42298 +       if (current->active_mm == mm) {
42299 +               if(current->mm)
42300 +                       __flush_tlb_one(va);
42301 +                else
42302 +                       leave_mm(smp_processor_id());
42303 +       }
42304 +
42305 +       if (!cpus_empty(cpu_mask))
42306 +               flush_tlb_others(cpu_mask, mm, va);
42307 +
42308 +       preempt_enable();
42309 +}
42310 +EXPORT_SYMBOL(flush_tlb_page);
42311 +
42312 +static void do_flush_tlb_all(void* info)
42313 +{
42314 +       unsigned long cpu = smp_processor_id();
42315 +
42316 +       __flush_tlb_all();
42317 +       if (read_pda(mmu_state) == TLBSTATE_LAZY)
42318 +               leave_mm(cpu);
42319 +}
42320 +
42321 +void flush_tlb_all(void)
42322 +{
42323 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
42324 +}
42325 +#else
42326 +asmlinkage void smp_invalidate_interrupt (void)
42327 +{ return; }
42328 +void flush_tlb_current_task(void)
42329 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
42330 +void flush_tlb_mm (struct mm_struct * mm)
42331 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
42332 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
42333 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
42334 +void flush_tlb_all(void)
42335 +{ xen_tlb_flush_all(); }
42336 +#endif /* Xen */
42337 +
42338 +/*
42339 + * this function sends a 'reschedule' IPI to another CPU.
42340 + * it goes straight through and wastes no time serializing
42341 + * anything. Worst case is that we lose a reschedule ...
42342 + */
42343 +
42344 +void smp_send_reschedule(int cpu)
42345 +{
42346 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
42347 +}
42348 +
42349 +/*
42350 + * Structure and data for smp_call_function(). This is designed to minimise
42351 + * static memory requirements. It also looks cleaner.
42352 + */
42353 +static DEFINE_SPINLOCK(call_lock);
42354 +
42355 +struct call_data_struct {
42356 +       void (*func) (void *info);
42357 +       void *info;
42358 +       atomic_t started;
42359 +       atomic_t finished;
42360 +       int wait;
42361 +};
42362 +
42363 +static struct call_data_struct * call_data;
42364 +
42365 +void lock_ipi_call_lock(void)
42366 +{
42367 +       spin_lock_irq(&call_lock);
42368 +}
42369 +
42370 +void unlock_ipi_call_lock(void)
42371 +{
42372 +       spin_unlock_irq(&call_lock);
42373 +}
42374 +
42375 +/*
42376 + * this function sends a 'generic call function' IPI to one other CPU
42377 + * in the system.
42378 + *
42379 + * cpu is a standard Linux logical CPU number.
42380 + */
42381 +static void
42382 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
42383 +                               int nonatomic, int wait)
42384 +{
42385 +       struct call_data_struct data;
42386 +       int cpus = 1;
42387 +
42388 +       data.func = func;
42389 +       data.info = info;
42390 +       atomic_set(&data.started, 0);
42391 +       data.wait = wait;
42392 +       if (wait)
42393 +               atomic_set(&data.finished, 0);
42394 +
42395 +       call_data = &data;
42396 +       wmb();
42397 +       /* Send a message to all other CPUs and wait for them to respond */
42398 +       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
42399 +
42400 +       /* Wait for response */
42401 +       while (atomic_read(&data.started) != cpus)
42402 +               cpu_relax();
42403 +
42404 +       if (!wait)
42405 +               return;
42406 +
42407 +       while (atomic_read(&data.finished) != cpus)
42408 +               cpu_relax();
42409 +}
42410 +
42411 +/*
42412 + * smp_call_function_single - Run a function on another CPU
42413 + * @func: The function to run. This must be fast and non-blocking.
42414 + * @info: An arbitrary pointer to pass to the function.
42415 + * @nonatomic: Currently unused.
42416 + * @wait: If true, wait until function has completed on other CPUs.
42417 + *
42418 + * Returns 0 on success, else a negative status code.
42419 + *
42420 + * Does not return until the remote CPU is nearly ready to execute <func>
42421 + * or is executing or has already executed it.
42422 + */
42423 +
42424 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
42425 +       int nonatomic, int wait)
42426 +{
42427 +       /* prevent preemption and reschedule on another processor */
42428 +       int me = get_cpu();
42429 +       if (cpu == me) {
42430 +               put_cpu();
42431 +               return 0;
42432 +       }
42433 +       spin_lock_bh(&call_lock);
42434 +       __smp_call_function_single(cpu, func, info, nonatomic, wait);
42435 +       spin_unlock_bh(&call_lock);
42436 +       put_cpu();
42437 +       return 0;
42438 +}
42439 +
42440 +/*
42441 + * this function sends a 'generic call function' IPI to all other CPUs
42442 + * in the system.
42443 + */
42444 +static void __smp_call_function (void (*func) (void *info), void *info,
42445 +                               int nonatomic, int wait)
42446 +{
42447 +       struct call_data_struct data;
42448 +       int cpus = num_online_cpus()-1;
42449 +
42450 +       if (!cpus)
42451 +               return;
42452 +
42453 +       data.func = func;
42454 +       data.info = info;
42455 +       atomic_set(&data.started, 0);
42456 +       data.wait = wait;
42457 +       if (wait)
42458 +               atomic_set(&data.finished, 0);
42459 +
42460 +       call_data = &data;
42461 +       wmb();
42462 +       /* Send a message to all other CPUs and wait for them to respond */
42463 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
42464 +
42465 +       /* Wait for response */
42466 +       while (atomic_read(&data.started) != cpus)
42467 +#ifndef CONFIG_XEN
42468 +               cpu_relax();
42469 +#else
42470 +               barrier();
42471 +#endif
42472 +
42473 +       if (!wait)
42474 +               return;
42475 +
42476 +       while (atomic_read(&data.finished) != cpus)
42477 +#ifndef CONFIG_XEN
42478 +               cpu_relax();
42479 +#else
42480 +               barrier();
42481 +#endif
42482 +}
42483 +
42484 +/*
42485 + * smp_call_function - run a function on all other CPUs.
42486 + * @func: The function to run. This must be fast and non-blocking.
42487 + * @info: An arbitrary pointer to pass to the function.
42488 + * @nonatomic: currently unused.
42489 + * @wait: If true, wait (atomically) until function has completed on other
42490 + *        CPUs.
42491 + *
42492 + * Returns 0 on success, else a negative status code. Does not return until
42493 + * remote CPUs are nearly ready to execute func or are or have executed.
42494 + *
42495 + * You must not call this function with disabled interrupts or from a
42496 + * hardware interrupt handler or from a bottom half handler.
42497 + * Actually there are a few legal cases, like panic.
42498 + */
42499 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
42500 +                       int wait)
42501 +{
42502 +       spin_lock(&call_lock);
42503 +       __smp_call_function(func,info,nonatomic,wait);
42504 +       spin_unlock(&call_lock);
42505 +       return 0;
42506 +}
42507 +EXPORT_SYMBOL(smp_call_function);
42508 +
42509 +void smp_stop_cpu(void)
42510 +{
42511 +       unsigned long flags;
42512 +       /*
42513 +        * Remove this CPU:
42514 +        */
42515 +       cpu_clear(smp_processor_id(), cpu_online_map);
42516 +       local_irq_save(flags);
42517 +#ifndef CONFIG_XEN
42518 +       disable_local_APIC();
42519 +#endif
42520 +       local_irq_restore(flags);
42521 +}
42522 +
42523 +static void smp_really_stop_cpu(void *dummy)
42524 +{
42525 +       smp_stop_cpu(); 
42526 +       for (;;) 
42527 +               halt();
42528 +} 
42529 +
42530 +void smp_send_stop(void)
42531 +{
42532 +       int nolock = 0;
42533 +#ifndef CONFIG_XEN
42534 +       if (reboot_force)
42535 +               return;
42536 +#endif
42537 +       /* Don't deadlock on the call lock in panic */
42538 +       if (!spin_trylock(&call_lock)) {
42539 +               /* ignore locking because we have panicked anyways */
42540 +               nolock = 1;
42541 +       }
42542 +       __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
42543 +       if (!nolock)
42544 +               spin_unlock(&call_lock);
42545 +
42546 +       local_irq_disable();
42547 +#ifndef CONFIG_XEN
42548 +       disable_local_APIC();
42549 +#endif
42550 +       local_irq_enable();
42551 +}
42552 +
42553 +/*
42554 + * Reschedule call back. Nothing to do,
42555 + * all the work is done automatically when
42556 + * we return from the interrupt.
42557 + */
42558 +#ifndef CONFIG_XEN
42559 +asmlinkage void smp_reschedule_interrupt(void)
42560 +#else
42561 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
42562 +#endif
42563 +{
42564 +#ifndef CONFIG_XEN
42565 +       ack_APIC_irq();
42566 +#else
42567 +       return IRQ_HANDLED;
42568 +#endif
42569 +}
42570 +
42571 +#ifndef CONFIG_XEN
42572 +asmlinkage void smp_call_function_interrupt(void)
42573 +#else
42574 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
42575 +#endif
42576 +{
42577 +       void (*func) (void *info) = call_data->func;
42578 +       void *info = call_data->info;
42579 +       int wait = call_data->wait;
42580 +
42581 +#ifndef CONFIG_XEN
42582 +       ack_APIC_irq();
42583 +#endif
42584 +       /*
42585 +        * Notify initiating CPU that I've grabbed the data and am
42586 +        * about to execute the function
42587 +        */
42588 +       mb();
42589 +       atomic_inc(&call_data->started);
42590 +       /*
42591 +        * At this point the info structure may be out of scope unless wait==1
42592 +        */
42593 +       exit_idle();
42594 +       irq_enter();
42595 +       (*func)(info);
42596 +       irq_exit();
42597 +       if (wait) {
42598 +               mb();
42599 +               atomic_inc(&call_data->finished);
42600 +       }
42601 +#ifdef CONFIG_XEN
42602 +       return IRQ_HANDLED;
42603 +#endif
42604 +}
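
As a rough illustration (not part of the patch itself): a minimal userspace pthread sketch of the started/finished rendezvous that __smp_call_function() and smp_call_function_interrupt() implement above. Thread creation stands in for sending the CALL_FUNCTION_VECTOR IPI, and the structure and field names are copied only for readability; everything else about the kernel environment is assumed away.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

struct call_data_struct {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct call_data_struct *call_data;

static void count_up(void *info)
{
	atomic_fetch_add((atomic_int *)info, 1);
}

/* Models the IPI handler: grab call_data, bump 'started', run the function,
   and bump 'finished' only when the caller asked to wait. */
static void *fake_ipi_handler(void *unused)
{
	struct call_data_struct *data = call_data;

	(void)unused;
	atomic_fetch_add(&data->started, 1);
	data->func(data->info);
	if (data->wait)
		atomic_fetch_add(&data->finished, 1);
	return NULL;
}

int main(void)
{
	struct call_data_struct data;
	atomic_int counter = 0;
	pthread_t tids[NCPUS];
	int i;

	data.func = count_up;
	data.info = &counter;
	atomic_init(&data.started, 0);
	atomic_init(&data.finished, 0);
	data.wait = 1;
	call_data = &data;

	/* "Send the IPI" to every target CPU. */
	for (i = 0; i < NCPUS; i++)
		pthread_create(&tids[i], NULL, fake_ipi_handler, NULL);

	/* Initiator: spin until every target has picked up the call data... */
	while (atomic_load(&data.started) != NCPUS)
		;
	/* ...and, because wait was requested, until they have all finished. */
	while (atomic_load(&data.finished) != NCPUS)
		;

	for (i = 0; i < NCPUS; i++)
		pthread_join(tids[i], NULL);

	printf("function ran on %d CPUs\n", atomic_load(&counter));
	return 0;
}

The first spin is what lets the kernel code return early when wait is 0; the second gives the synchronous behaviour of wait=1. Compile with -pthread.
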
42605 diff -ruNp linux-2.6.19/arch/x86_64/kernel/traps-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/traps-xen.c
42606 --- linux-2.6.19/arch/x86_64/kernel/traps-xen.c 1970-01-01 00:00:00.000000000 +0000
42607 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/traps-xen.c       2007-02-02 19:10:27.000000000 +0000
42608 @@ -0,0 +1,1245 @@
42609 +/*
42610 + *  linux/arch/x86-64/traps.c
42611 + *
42612 + *  Copyright (C) 1991, 1992  Linus Torvalds
42613 + *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
42614 + *
42615 + *  Pentium III FXSR, SSE support
42616 + *     Gareth Hughes <gareth@valinux.com>, May 2000
42617 + */
42618 +
42619 +/*
42620 + * 'Traps.c' handles hardware traps and faults after we have saved some
42621 + * state in 'entry.S'.
42622 + */
42623 +#include <linux/sched.h>
42624 +#include <linux/kernel.h>
42625 +#include <linux/string.h>
42626 +#include <linux/errno.h>
42627 +#include <linux/ptrace.h>
42628 +#include <linux/timer.h>
42629 +#include <linux/mm.h>
42630 +#include <linux/init.h>
42631 +#include <linux/delay.h>
42632 +#include <linux/spinlock.h>
42633 +#include <linux/interrupt.h>
42634 +#include <linux/kallsyms.h>
42635 +#include <linux/module.h>
42636 +#include <linux/moduleparam.h>
42637 +#include <linux/nmi.h>
42638 +#include <linux/kprobes.h>
42639 +#include <linux/kexec.h>
42640 +#include <linux/unwind.h>
42641 +
42642 +#include <asm/system.h>
42643 +#include <asm/uaccess.h>
42644 +#include <asm/io.h>
42645 +#include <asm/atomic.h>
42646 +#include <asm/debugreg.h>
42647 +#include <asm/desc.h>
42648 +#include <asm/i387.h>
42649 +#include <asm/kdebug.h>
42650 +#include <asm/processor.h>
42651 +#include <asm/unwind.h>
42652 +#include <asm/smp.h>
42653 +#include <asm/pgalloc.h>
42654 +#include <asm/pda.h>
42655 +#include <asm/proto.h>
42656 +#include <asm/nmi.h>
42657 +#include <asm/stacktrace.h>
42658 +
42659 +asmlinkage void divide_error(void);
42660 +asmlinkage void debug(void);
42661 +asmlinkage void nmi(void);
42662 +asmlinkage void int3(void);
42663 +asmlinkage void overflow(void);
42664 +asmlinkage void bounds(void);
42665 +asmlinkage void invalid_op(void);
42666 +asmlinkage void device_not_available(void);
42667 +asmlinkage void double_fault(void);
42668 +asmlinkage void coprocessor_segment_overrun(void);
42669 +asmlinkage void invalid_TSS(void);
42670 +asmlinkage void segment_not_present(void);
42671 +asmlinkage void stack_segment(void);
42672 +asmlinkage void general_protection(void);
42673 +asmlinkage void page_fault(void);
42674 +asmlinkage void coprocessor_error(void);
42675 +asmlinkage void simd_coprocessor_error(void);
42676 +asmlinkage void reserved(void);
42677 +asmlinkage void alignment_check(void);
42678 +asmlinkage void machine_check(void);
42679 +asmlinkage void spurious_interrupt_bug(void);
42680 +
42681 +ATOMIC_NOTIFIER_HEAD(die_chain);
42682 +EXPORT_SYMBOL(die_chain);
42683 +
42684 +int register_die_notifier(struct notifier_block *nb)
42685 +{
42686 +       vmalloc_sync_all();
42687 +       return atomic_notifier_chain_register(&die_chain, nb);
42688 +}
42689 +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
42690 +
42691 +int unregister_die_notifier(struct notifier_block *nb)
42692 +{
42693 +       return atomic_notifier_chain_unregister(&die_chain, nb);
42694 +}
42695 +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
42696 +
42697 +static inline void conditional_sti(struct pt_regs *regs)
42698 +{
42699 +       if (regs->eflags & X86_EFLAGS_IF)
42700 +               local_irq_enable();
42701 +}
42702 +
42703 +static inline void preempt_conditional_sti(struct pt_regs *regs)
42704 +{
42705 +       preempt_disable();
42706 +       if (regs->eflags & X86_EFLAGS_IF)
42707 +               local_irq_enable();
42708 +}
42709 +
42710 +static inline void preempt_conditional_cli(struct pt_regs *regs)
42711 +{
42712 +       if (regs->eflags & X86_EFLAGS_IF)
42713 +               local_irq_disable();
42714 +       /* Make sure to not schedule here because we could be running
42715 +          on an exception stack. */
42716 +       preempt_enable_no_resched();
42717 +}
42718 +
42719 +static int kstack_depth_to_print = 12;
42720 +#ifdef CONFIG_STACK_UNWIND
42721 +static int call_trace = 1;
42722 +#else
42723 +#define call_trace (-1)
42724 +#endif
42725 +
42726 +#ifdef CONFIG_KALLSYMS
42727 +void printk_address(unsigned long address)
42728 +{
42729 +       unsigned long offset = 0, symsize;
42730 +       const char *symname;
42731 +       char *modname;
42732 +       char *delim = ":";
42733 +       char namebuf[128];
42734 +
42735 +       symname = kallsyms_lookup(address, &symsize, &offset,
42736 +                                       &modname, namebuf);
42737 +       if (!symname) {
42738 +               printk(" [<%016lx>]\n", address);
42739 +               return;
42740 +       }
42741 +       if (!modname)
42742 +               modname = delim = "";           
42743 +       printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
42744 +               address, delim, modname, delim, symname, offset, symsize);
42745 +}
42746 +#else
42747 +void printk_address(unsigned long address)
42748 +{
42749 +       printk(" [<%016lx>]\n", address);
42750 +}
42751 +#endif
42752 +
42753 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
42754 +                                       unsigned *usedp, char **idp)
42755 +{
42756 +#ifndef CONFIG_X86_NO_TSS
42757 +       static char ids[][8] = {
42758 +               [DEBUG_STACK - 1] = "#DB",
42759 +               [NMI_STACK - 1] = "NMI",
42760 +               [DOUBLEFAULT_STACK - 1] = "#DF",
42761 +               [STACKFAULT_STACK - 1] = "#SS",
42762 +               [MCE_STACK - 1] = "#MC",
42763 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
42764 +               [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
42765 +#endif
42766 +       };
42767 +       unsigned k;
42768 +
42769 +       /*
42770 +        * Iterate over all exception stacks, and figure out whether
42771 +        * 'stack' is in one of them:
42772 +        */
42773 +       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
42774 +               unsigned long end = per_cpu(orig_ist, cpu).ist[k];
42775 +               /*
42776 +                * Is 'stack' above this exception frame's end?
42777 +                * If yes then skip to the next frame.
42778 +                */
42779 +               if (stack >= end)
42780 +                       continue;
42781 +               /*
42782 +                * Is 'stack' above this exception frame's start address?
42783 +                * If yes then we found the right frame.
42784 +                */
42785 +               if (stack >= end - EXCEPTION_STKSZ) {
42786 +                       /*
42787 +                        * Make sure we only iterate through an exception
42788 +                        * stack once. If it comes up for the second time
42789 +                        * then there's something wrong going on - just
42790 +                        * break out and return NULL:
42791 +                        */
42792 +                       if (*usedp & (1U << k))
42793 +                               break;
42794 +                       *usedp |= 1U << k;
42795 +                       *idp = ids[k];
42796 +                       return (unsigned long *)end;
42797 +               }
42798 +               /*
42799 +                * If this is a debug stack, and if it has a larger size than
42800 +                * the usual exception stacks, then 'stack' might still
42801 +                * be within the lower portion of the debug stack:
42802 +                */
42803 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
42804 +               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
42805 +                       unsigned j = N_EXCEPTION_STACKS - 1;
42806 +
42807 +                       /*
42808 +                        * Black magic. A large debug stack is composed of
42809 +                        * multiple exception stack entries, which we
42810 +                        * iterate through now. Don't look:
42811 +                        */
42812 +                       do {
42813 +                               ++j;
42814 +                               end -= EXCEPTION_STKSZ;
42815 +                               ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
42816 +                       } while (stack < end - EXCEPTION_STKSZ);
42817 +                       if (*usedp & (1U << j))
42818 +                               break;
42819 +                       *usedp |= 1U << j;
42820 +                       *idp = ids[j];
42821 +                       return (unsigned long *)end;
42822 +               }
42823 +#endif
42824 +       }
42825 +#endif
42826 +       return NULL;
42827 +}
42828 +
42829 +struct ops_and_data {
42830 +       struct stacktrace_ops *ops;
42831 +       void *data;
42832 +};
42833 +
42834 +static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
42835 +{
42836 +       struct ops_and_data *oad = (struct ops_and_data *)context;
42837 +       int n = 0;
42838 +
42839 +       while (unwind(info) == 0 && UNW_PC(info)) {
42840 +               n++;
42841 +               oad->ops->address(oad->data, UNW_PC(info));
42842 +               if (arch_unw_user_mode(info))
42843 +                       break;
42844 +       }
42845 +       return n;
42846 +}
42847 +
42848 +/*
42849 + * x86-64 can have up to three kernel stacks:
42850 + * process stack
42851 + * interrupt stack
42852 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
42853 + */
42854 +
42855 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
42856 +{
42857 +       void *t = (void *)tinfo;
42858 +        return p > t && p < t + THREAD_SIZE - 3;
42859 +}
42860 +
42861 +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
42862 +               struct stacktrace_ops *ops, void *data)
42863 +{
42864 +       const unsigned cpu = smp_processor_id();
42865 +       unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
42866 +       unsigned used = 0;
42867 +       struct thread_info *tinfo;
42868 +
42869 +       if (!tsk)
42870 +               tsk = current;
42871 +
42872 +       if (call_trace >= 0) {
42873 +               int unw_ret = 0;
42874 +               struct unwind_frame_info info;
42875 +               struct ops_and_data oad = { .ops = ops, .data = data };
42876 +
42877 +               if (regs) {
42878 +                       if (unwind_init_frame_info(&info, tsk, regs) == 0)
42879 +                               unw_ret = dump_trace_unwind(&info, &oad);
42880 +               } else if (tsk == current)
42881 +                       unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
42882 +               else {
42883 +                       if (unwind_init_blocked(&info, tsk) == 0)
42884 +                               unw_ret = dump_trace_unwind(&info, &oad);
42885 +               }
42886 +               if (unw_ret > 0) {
42887 +                       if (call_trace == 1 && !arch_unw_user_mode(&info)) {
42888 +                               ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
42889 +                                            UNW_PC(&info));
42890 +                               if ((long)UNW_SP(&info) < 0) {
42891 +                                       ops->warning(data, "Leftover inexact backtrace:\n");
42892 +                                       stack = (unsigned long *)UNW_SP(&info);
42893 +                                       if (!stack)
42894 +                                               return;
42895 +                               } else
42896 +                                       ops->warning(data, "Full inexact backtrace again:\n");
42897 +                       } else if (call_trace >= 1)
42898 +                               return;
42899 +                       else
42900 +                               ops->warning(data, "Full inexact backtrace again:\n");
42901 +               } else
42902 +                       ops->warning(data, "Inexact backtrace:\n");
42903 +       }
42904 +       if (!stack) {
42905 +               unsigned long dummy;
42906 +               stack = &dummy;
42907 +               if (tsk && tsk != current)
42908 +                       stack = (unsigned long *)tsk->thread.rsp;
42909 +       }
42910 +       /*
42911 +        * Align the stack pointer on word boundary, later loops
42912 +        * rely on that (and corruption / debug info bugs can cause
42913 +        * unaligned values here):
42914 +        */
42915 +       stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
42916 +
42917 +       /*
42918 +        * Print function call entries within a stack. 'cond' is the
42919 +        * "end of stackframe" condition, that the 'stack++'
42920 +        * iteration will eventually trigger.
42921 +        */
42922 +#define HANDLE_STACK(cond) \
42923 +       do while (cond) { \
42924 +               unsigned long addr = *stack++; \
42925 +               if (oops_in_progress ?          \
42926 +                       __kernel_text_address(addr) : \
42927 +                       kernel_text_address(addr)) { \
42928 +                       /* \
42929 +                        * If the address is either in the text segment of the \
42930 +                        * kernel, or in the region which contains vmalloc'ed \
42931 +                        * memory, it *may* be the address of a calling \
42932 +                        * routine; if so, print it so that someone tracing \
42933 +                        * down the cause of the crash will be able to figure \
42934 +                        * out the call path that was taken. \
42935 +                        */ \
42936 +                       ops->address(data, addr);   \
42937 +               } \
42938 +       } while (0)
42939 +
42940 +       /*
42941 +        * Print function call entries in all stacks, starting at the
42942 +        * current stack address. If the stacks consist of nested
42943 +        * exceptions
42944 +        * current stack address. If the stacks consist of nested
42945 +        * exceptions, follow the links from one stack to the next.
42946 +               char *id;
42947 +               unsigned long *estack_end;
42948 +               estack_end = in_exception_stack(cpu, (unsigned long)stack,
42949 +                                               &used, &id);
42950 +
42951 +               if (estack_end) {
42952 +                       if (ops->stack(data, id) < 0)
42953 +                               break;
42954 +                       HANDLE_STACK (stack < estack_end);
42955 +                       ops->stack(data, "<EOE>");
42956 +                       /*
42957 +                        * We link to the next stack via the
42958 +                        * second-to-last pointer (index -2 to end) in the
42959 +                        * exception stack:
42960 +                        */
42961 +                       stack = (unsigned long *) estack_end[-2];
42962 +                       continue;
42963 +               }
42964 +               if (irqstack_end) {
42965 +                       unsigned long *irqstack;
42966 +                       irqstack = irqstack_end -
42967 +                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
42968 +
42969 +                       if (stack >= irqstack && stack < irqstack_end) {
42970 +                               if (ops->stack(data, "IRQ") < 0)
42971 +                                       break;
42972 +                               HANDLE_STACK (stack < irqstack_end);
42973 +                               /*
42974 +                                * We link to the next stack (which would be
42975 +                                * the process stack normally) the last
42976 +                                * pointer (index -1 to end) in the IRQ stack:
42977 +                                */
42978 +                               stack = (unsigned long *) (irqstack_end[-1]);
42979 +                               irqstack_end = NULL;
42980 +                               ops->stack(data, "EOI");
42981 +                               continue;
42982 +                       }
42983 +               }
42984 +               break;
42985 +       }
42986 +
42987 +       /*
42988 +        * This handles the process stack:
42989 +        */
42990 +       tinfo = current_thread_info();
42991 +       HANDLE_STACK (valid_stack_ptr(tinfo, stack));
42992 +#undef HANDLE_STACK
42993 +}
42994 +EXPORT_SYMBOL(dump_trace);
42995 +
42996 +static void
42997 +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
42998 +{
42999 +       print_symbol(msg, symbol);
43000 +       printk("\n");
43001 +}
43002 +
43003 +static void print_trace_warning(void *data, char *msg)
43004 +{
43005 +       printk("%s\n", msg);
43006 +}
43007 +
43008 +static int print_trace_stack(void *data, char *name)
43009 +{
43010 +       printk(" <%s> ", name);
43011 +       return 0;
43012 +}
43013 +
43014 +static void print_trace_address(void *data, unsigned long addr)
43015 +{
43016 +       printk_address(addr);
43017 +}
43018 +
43019 +static struct stacktrace_ops print_trace_ops = {
43020 +       .warning = print_trace_warning,
43021 +       .warning_symbol = print_trace_warning_symbol,
43022 +       .stack = print_trace_stack,
43023 +       .address = print_trace_address,
43024 +};
43025 +
43026 +void
43027 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
43028 +{
43029 +       printk("\nCall Trace:\n");
43030 +       dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
43031 +       printk("\n");
43032 +}
43033 +
43034 +static void
43035 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
43036 +{
43037 +       unsigned long *stack;
43038 +       int i;
43039 +       const int cpu = smp_processor_id();
43040 +       unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
43041 +       unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
43042 +
43043 +       // debugging aid: "show_stack(NULL, NULL);" prints the
43044 +       // back trace for this cpu.
43045 +
43046 +       if (rsp == NULL) {
43047 +               if (tsk)
43048 +                       rsp = (unsigned long *)tsk->thread.rsp;
43049 +               else
43050 +                       rsp = (unsigned long *)&rsp;
43051 +       }
43052 +
43053 +       stack = rsp;
43054 +       for(i=0; i < kstack_depth_to_print; i++) {
43055 +               if (stack >= irqstack && stack <= irqstack_end) {
43056 +                       if (stack == irqstack_end) {
43057 +                               stack = (unsigned long *) (irqstack_end[-1]);
43058 +                               printk(" <EOI> ");
43059 +                       }
43060 +               } else {
43061 +               if (((long) stack & (THREAD_SIZE-1)) == 0)
43062 +                       break;
43063 +               }
43064 +               if (i && ((i % 4) == 0))
43065 +                       printk("\n");
43066 +               printk(" %016lx", *stack++);
43067 +               touch_nmi_watchdog();
43068 +       }
43069 +       show_trace(tsk, regs, rsp);
43070 +}
43071 +
43072 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
43073 +{
43074 +       _show_stack(tsk, NULL, rsp);
43075 +}
43076 +
43077 +/*
43078 + * The architecture-independent dump_stack generator
43079 + */
43080 +void dump_stack(void)
43081 +{
43082 +       unsigned long dummy;
43083 +       show_trace(NULL, NULL, &dummy);
43084 +}
43085 +
43086 +EXPORT_SYMBOL(dump_stack);
43087 +
43088 +void show_registers(struct pt_regs *regs)
43089 +{
43090 +       int i;
43091 +       int in_kernel = !user_mode(regs);
43092 +       unsigned long rsp;
43093 +       const int cpu = smp_processor_id();
43094 +       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
43095 +
43096 +               rsp = regs->rsp;
43097 +
43098 +       printk("CPU %d ", cpu);
43099 +       __show_regs(regs);
43100 +       printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
43101 +               cur->comm, cur->pid, task_thread_info(cur), cur);
43102 +
43103 +       /*
43104 +        * When in-kernel, we also print out the stack and code at the
43105 +        * time of the fault..
43106 +        */
43107 +       if (in_kernel) {
43108 +
43109 +               printk("Stack: ");
43110 +               _show_stack(NULL, regs, (unsigned long*)rsp);
43111 +
43112 +               printk("\nCode: ");
43113 +               if (regs->rip < PAGE_OFFSET)
43114 +                       goto bad;
43115 +
43116 +               for (i=0; i<20; i++) {
43117 +                       unsigned char c;
43118 +                       if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
43119 +bad:
43120 +                               printk(" Bad RIP value.");
43121 +                               break;
43122 +                       }
43123 +                       printk("%02x ", c);
43124 +               }
43125 +       }
43126 +       printk("\n");
43127 +}      
43128 +
43129 +void handle_BUG(struct pt_regs *regs)
43130 +{ 
43131 +       struct bug_frame f;
43132 +       long len;
43133 +       const char *prefix = "";
43134 +
43135 +       if (user_mode(regs))
43136 +               return; 
43137 +       if (__copy_from_user(&f, (const void __user *) regs->rip,
43138 +                            sizeof(struct bug_frame)))
43139 +               return; 
43140 +       if (f.filename >= 0 ||
43141 +           f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 
43142 +               return;
43143 +       len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
43144 +       if (len < 0 || len >= PATH_MAX)
43145 +               f.filename = (int)(long)"unmapped filename";
43146 +       else if (len > 50) {
43147 +               f.filename += len - 50;
43148 +               prefix = "...";
43149 +       }
43150 +       printk("----------- [cut here ] --------- [please bite here ] ---------\n");
43151 +       printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
43152 +} 
43153 +
43154 +#ifdef CONFIG_BUG
43155 +void out_of_line_bug(void)
43156 +{ 
43157 +       BUG(); 
43158 +} 
43159 +EXPORT_SYMBOL(out_of_line_bug);
43160 +#endif
43161 +
43162 +static DEFINE_SPINLOCK(die_lock);
43163 +static int die_owner = -1;
43164 +static unsigned int die_nest_count;
43165 +
43166 +unsigned __kprobes long oops_begin(void)
43167 +{
43168 +       int cpu = smp_processor_id();
43169 +       unsigned long flags;
43170 +
43171 +       oops_enter();
43172 +
43173 +       /* racy, but better than risking deadlock. */
43174 +       local_irq_save(flags);
43175 +       if (!spin_trylock(&die_lock)) { 
43176 +               if (cpu == die_owner) 
43177 +                       /* nested oops. should stop eventually */;
43178 +               else
43179 +                       spin_lock(&die_lock);
43180 +       }
43181 +       die_nest_count++;
43182 +       die_owner = cpu;
43183 +       console_verbose();
43184 +       bust_spinlocks(1);
43185 +       return flags;
43186 +}
43187 +
43188 +void __kprobes oops_end(unsigned long flags)
43189 +{ 
43190 +       die_owner = -1;
43191 +       bust_spinlocks(0);
43192 +       die_nest_count--;
43193 +       if (die_nest_count)
43194 +               /* We still own the lock */
43195 +               local_irq_restore(flags);
43196 +       else
43197 +               /* Nest count reaches zero, release the lock. */
43198 +               spin_unlock_irqrestore(&die_lock, flags);
43199 +       if (panic_on_oops)
43200 +               panic("Fatal exception");
43201 +       oops_exit();
43202 +}
43203 +
43204 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
43205 +{
43206 +       static int die_counter;
43207 +       printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
43208 +#ifdef CONFIG_PREEMPT
43209 +       printk("PREEMPT ");
43210 +#endif
43211 +#ifdef CONFIG_SMP
43212 +       printk("SMP ");
43213 +#endif
43214 +#ifdef CONFIG_DEBUG_PAGEALLOC
43215 +       printk("DEBUG_PAGEALLOC");
43216 +#endif
43217 +       printk("\n");
43218 +       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
43219 +       show_registers(regs);
43220 +       /* Executive summary in case the oops scrolled away */
43221 +       printk(KERN_ALERT "RIP ");
43222 +       printk_address(regs->rip); 
43223 +       printk(" RSP <%016lx>\n", regs->rsp); 
43224 +       if (kexec_should_crash(current))
43225 +               crash_kexec(regs);
43226 +}
43227 +
43228 +void die(const char * str, struct pt_regs * regs, long err)
43229 +{
43230 +       unsigned long flags = oops_begin();
43231 +
43232 +       handle_BUG(regs);
43233 +       __die(str, regs, err);
43234 +       oops_end(flags);
43235 +       do_exit(SIGSEGV); 
43236 +}
43237 +
43238 +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
43239 +{
43240 +       unsigned long flags = oops_begin();
43241 +
43242 +       /*
43243 +        * We are in trouble anyway, lets at least try
43244 +        * We are in trouble anyway, let's at least try
43245 +        */
43246 +       printk(str, smp_processor_id());
43247 +       show_registers(regs);
43248 +       if (kexec_should_crash(current))
43249 +               crash_kexec(regs);
43250 +       if (do_panic || panic_on_oops)
43251 +               panic("Non maskable interrupt");
43252 +       oops_end(flags);
43253 +       nmi_exit();
43254 +       local_irq_enable();
43255 +       do_exit(SIGSEGV);
43256 +}
43257 +
43258 +static void __kprobes do_trap(int trapnr, int signr, char *str,
43259 +                             struct pt_regs * regs, long error_code,
43260 +                             siginfo_t *info)
43261 +{
43262 +       struct task_struct *tsk = current;
43263 +
43264 +       tsk->thread.error_code = error_code;
43265 +       tsk->thread.trap_no = trapnr;
43266 +
43267 +       if (user_mode(regs)) {
43268 +               if (exception_trace && unhandled_signal(tsk, signr))
43269 +                       printk(KERN_INFO
43270 +                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
43271 +                              tsk->comm, tsk->pid, str,
43272 +                              regs->rip, regs->rsp, error_code); 
43273 +
43274 +               if (info)
43275 +                       force_sig_info(signr, info, tsk);
43276 +               else
43277 +                       force_sig(signr, tsk);
43278 +               return;
43279 +       }
43280 +
43281 +
43282 +       /* kernel trap */ 
43283 +       {            
43284 +               const struct exception_table_entry *fixup;
43285 +               fixup = search_exception_tables(regs->rip);
43286 +               if (fixup)
43287 +                       regs->rip = fixup->fixup;
43288 +               else    
43289 +                       die(str, regs, error_code);
43290 +               return;
43291 +       }
43292 +}
43293 +
43294 +#define DO_ERROR(trapnr, signr, str, name) \
43295 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
43296 +{ \
43297 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
43298 +                                                       == NOTIFY_STOP) \
43299 +               return; \
43300 +       conditional_sti(regs);                                          \
43301 +       do_trap(trapnr, signr, str, regs, error_code, NULL); \
43302 +}
43303 +
43304 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
43305 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
43306 +{ \
43307 +       siginfo_t info; \
43308 +       info.si_signo = signr; \
43309 +       info.si_errno = 0; \
43310 +       info.si_code = sicode; \
43311 +       info.si_addr = (void __user *)siaddr; \
43312 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
43313 +                                                       == NOTIFY_STOP) \
43314 +               return; \
43315 +       conditional_sti(regs);                                          \
43316 +       do_trap(trapnr, signr, str, regs, error_code, &info); \
43317 +}
43318 +
43319 +DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
43320 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
43321 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
43322 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
43323 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
43324 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
43325 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
43326 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
43327 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
43328 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
43329 +
43330 +/* Runs on IST stack */
43331 +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
43332 +{
43333 +       if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
43334 +                       12, SIGBUS) == NOTIFY_STOP)
43335 +               return;
43336 +       preempt_conditional_sti(regs);
43337 +       do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
43338 +       preempt_conditional_cli(regs);
43339 +}
43340 +
43341 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
43342 +{
43343 +       static const char str[] = "double fault";
43344 +       struct task_struct *tsk = current;
43345 +
43346 +       /* Return not checked because a double fault cannot be ignored */
43347 +       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
43348 +
43349 +       tsk->thread.error_code = error_code;
43350 +       tsk->thread.trap_no = 8;
43351 +
43352 +       /* This is always a kernel trap and never fixable (and thus must
43353 +          never return). */
43354 +       for (;;)
43355 +               die(str, regs, error_code);
43356 +}
43357 +
43358 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
43359 +                                               long error_code)
43360 +{
43361 +       struct task_struct *tsk = current;
43362 +
43363 +       conditional_sti(regs);
43364 +
43365 +       tsk->thread.error_code = error_code;
43366 +       tsk->thread.trap_no = 13;
43367 +
43368 +       if (user_mode(regs)) {
43369 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV))
43370 +                       printk(KERN_INFO
43371 +                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
43372 +                              tsk->comm, tsk->pid,
43373 +                              regs->rip, regs->rsp, error_code); 
43374 +
43375 +               force_sig(SIGSEGV, tsk);
43376 +               return;
43377 +       } 
43378 +
43379 +       /* kernel gp */
43380 +       {
43381 +               const struct exception_table_entry *fixup;
43382 +               fixup = search_exception_tables(regs->rip);
43383 +               if (fixup) {
43384 +                       regs->rip = fixup->fixup;
43385 +                       return;
43386 +               }
43387 +               if (notify_die(DIE_GPF, "general protection fault", regs,
43388 +                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
43389 +                       return;
43390 +               die("general protection fault", regs, error_code);
43391 +       }
43392 +}
43393 +
43394 +static __kprobes void
43395 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
43396 +{
43397 +       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
43398 +               reason);
43399 +       printk(KERN_EMERG "You probably have a hardware problem with your "
43400 +               "RAM chips\n");
43401 +
43402 +       if (panic_on_unrecovered_nmi)
43403 +               panic("NMI: Not continuing");
43404 +
43405 +       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
43406 +
43407 +#if 0 /* XEN */
43408 +       /* Clear and disable the memory parity error line. */
43409 +       reason = (reason & 0xf) | 4;
43410 +       outb(reason, 0x61);
43411 +#endif /* XEN */
43412 +}
43413 +
43414 +static __kprobes void
43415 +io_check_error(unsigned char reason, struct pt_regs * regs)
43416 +{
43417 +       printk("NMI: IOCK error (debug interrupt?)\n");
43418 +       show_registers(regs);
43419 +
43420 +#if 0 /* XEN */
43421 +       /* Re-enable the IOCK line, wait for a few seconds */
43422 +       reason = (reason & 0xf) | 8;
43423 +       outb(reason, 0x61);
43424 +       mdelay(2000);
43425 +       reason &= ~8;
43426 +       outb(reason, 0x61);
43427 +#endif /* XEN */
43428 +}
43429 +
43430 +static __kprobes void
43431 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
43432 +{
43433 +       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
43434 +               reason);
43435 +       printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
43436 +
43437 +       if (panic_on_unrecovered_nmi)
43438 +               panic("NMI: Not continuing");
43439 +
43440 +       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
43441 +}
43442 +
43443 +/* Runs on IST stack. This code must keep interrupts off all the time.
43444 +   Nested NMIs are prevented by the CPU. */
43445 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
43446 +{
43447 +       unsigned char reason = 0;
43448 +       int cpu;
43449 +
43450 +       cpu = smp_processor_id();
43451 +
43452 +       /* Only the BSP gets external NMIs from the system.  */
43453 +       if (!cpu)
43454 +               reason = get_nmi_reason();
43455 +
43456 +       if (!(reason & 0xc0)) {
43457 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
43458 +                                                               == NOTIFY_STOP)
43459 +                       return;
43460 +               /*
43461 +                * Ok, so this is none of the documented NMI sources,
43462 +                * so it must be the NMI watchdog.
43463 +                */
43464 +               if (nmi_watchdog_tick(regs,reason))
43465 +                       return;
43466 +               if (!do_nmi_callback(regs,cpu))
43467 +                       unknown_nmi_error(reason, regs);
43468 +
43469 +               return;
43470 +       }
43471 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
43472 +               return; 
43473 +
43474 +       /* AK: following checks seem to be broken on modern chipsets. FIXME */
43475 +
43476 +       if (reason & 0x80)
43477 +               mem_parity_error(reason, regs);
43478 +       if (reason & 0x40)
43479 +               io_check_error(reason, regs);
43480 +}
43481 +
43482 +/* runs on IST stack. */
43483 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
43484 +{
43485 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
43486 +               return;
43487 +       }
43488 +       preempt_conditional_sti(regs);
43489 +       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
43490 +       preempt_conditional_cli(regs);
43491 +}
43492 +
43493 +/* Help handler running on IST stack to switch back to user stack
43494 +   for scheduling or signal handling. The actual stack switch is done in
43495 +   entry.S */
43496 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
43497 +{
43498 +       struct pt_regs *regs = eregs;
43499 +       /* Did already sync */
43500 +       if (eregs == (struct pt_regs *)eregs->rsp)
43501 +               ;
43502 +       /* Exception from user space */
43503 +       else if (user_mode(eregs))
43504 +               regs = task_pt_regs(current);
43505 +       /* Exception from kernel and interrupts are enabled. Move to
43506 +          kernel process stack. */
43507 +       else if (eregs->eflags & X86_EFLAGS_IF)
43508 +               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
43509 +       if (eregs != regs)
43510 +               *regs = *eregs;
43511 +       return regs;
43512 +}
43513 +
43514 +/* runs on IST stack. */
43515 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
43516 +                                  unsigned long error_code)
43517 +{
43518 +       unsigned long condition;
43519 +       struct task_struct *tsk = current;
43520 +       siginfo_t info;
43521 +
43522 +       get_debugreg(condition, 6);
43523 +
43524 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
43525 +                                               SIGTRAP) == NOTIFY_STOP)
43526 +               return;
43527 +
43528 +       preempt_conditional_sti(regs);
43529 +
43530 +       /* Mask out spurious debug traps due to lazy DR7 setting */
43531 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
43532 +               if (!tsk->thread.debugreg7) { 
43533 +                       goto clear_dr7;
43534 +               }
43535 +       }
43536 +
43537 +       tsk->thread.debugreg6 = condition;
43538 +
43539 +       /* Mask out spurious TF errors due to lazy TF clearing */
43540 +       if (condition & DR_STEP) {
43541 +               /*
43542 +                * The TF error should be masked out only if the current
43543 +                * process is not traced and if the TRAP flag has been set
43544 +                * previously by a tracing process (condition detected by
43545 +                * the PT_DTRACE flag); remember that the i386 TRAP flag
43546 +                * can be modified by the process itself in user mode,
43547 +                * allowing programs to debug themselves without the ptrace()
43548 +                * interface.
43549 +                */
43550 +                if (!user_mode(regs))
43551 +                       goto clear_TF_reenable;
43552 +               /*
43553 +                * Was the TF flag set by a debugger? If so, clear it now,
43554 +                * so that register information is correct.
43555 +                */
43556 +               if (tsk->ptrace & PT_DTRACE) {
43557 +                       regs->eflags &= ~TF_MASK;
43558 +                       tsk->ptrace &= ~PT_DTRACE;
43559 +               }
43560 +       }
43561 +
43562 +       /* Ok, finally something we can handle */
43563 +       tsk->thread.trap_no = 1;
43564 +       tsk->thread.error_code = error_code;
43565 +       info.si_signo = SIGTRAP;
43566 +       info.si_errno = 0;
43567 +       info.si_code = TRAP_BRKPT;
43568 +       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
43569 +       force_sig_info(SIGTRAP, &info, tsk);
43570 +
43571 +clear_dr7:
43572 +       set_debugreg(0UL, 7);
43573 +       preempt_conditional_cli(regs);
43574 +       return;
43575 +
43576 +clear_TF_reenable:
43577 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
43578 +       regs->eflags &= ~TF_MASK;
43579 +       preempt_conditional_cli(regs);
43580 +}
43581 +
43582 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
43583 +{
43584 +       const struct exception_table_entry *fixup;
43585 +       fixup = search_exception_tables(regs->rip);
43586 +       if (fixup) {
43587 +               regs->rip = fixup->fixup;
43588 +               return 1;
43589 +       }
43590 +       notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
43591 +       /* Illegal floating point operation in the kernel */
43592 +       current->thread.trap_no = trapnr;
43593 +       die(str, regs, 0);
43594 +       return 0;
43595 +}
43596 +
43597 +/*
43598 + * Note that we play around with the 'TS' bit in an attempt to get
43599 + * the correct behaviour even in the presence of the asynchronous
43600 + * IRQ13 behaviour
43601 + */
43602 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
43603 +{
43604 +       void __user *rip = (void __user *)(regs->rip);
43605 +       struct task_struct * task;
43606 +       siginfo_t info;
43607 +       unsigned short cwd, swd;
43608 +
43609 +       conditional_sti(regs);
43610 +       if (!user_mode(regs) &&
43611 +           kernel_math_error(regs, "kernel x87 math error", 16))
43612 +               return;
43613 +
43614 +       /*
43615 +        * Save the info for the exception handler and clear the error.
43616 +        */
43617 +       task = current;
43618 +       save_init_fpu(task);
43619 +       task->thread.trap_no = 16;
43620 +       task->thread.error_code = 0;
43621 +       info.si_signo = SIGFPE;
43622 +       info.si_errno = 0;
43623 +       info.si_code = __SI_FAULT;
43624 +       info.si_addr = rip;
43625 +       /*
43626 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
43627 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
43628 +        * C1 reg you need in case of a stack fault, 0x040 is the stack
43629 +        * fault bit.  We should only be taking one exception at a time,
43630 +        * so if this combination doesn't produce any single exception,
43631 +        * then we have a bad program that isn't synchronizing its FPU usage
43632 +        * and it will suffer the consequences since we won't be able to
43633 +        * fully reproduce the context of the exception
43634 +        */
43635 +       cwd = get_fpu_cwd(task);
43636 +       swd = get_fpu_swd(task);
43637 +       switch (swd & ~cwd & 0x3f) {
43638 +               case 0x000:
43639 +               default:
43640 +                       break;
43641 +               case 0x001: /* Invalid Op */
43642 +                       /*
43643 +                        * swd & 0x240 == 0x040: Stack Underflow
43644 +                        * swd & 0x240 == 0x240: Stack Overflow
43645 +                        * User must clear the SF bit (0x40) if set
43646 +                        */
43647 +                       info.si_code = FPE_FLTINV;
43648 +                       break;
43649 +               case 0x002: /* Denormalize */
43650 +               case 0x010: /* Underflow */
43651 +                       info.si_code = FPE_FLTUND;
43652 +                       break;
43653 +               case 0x004: /* Zero Divide */
43654 +                       info.si_code = FPE_FLTDIV;
43655 +                       break;
43656 +               case 0x008: /* Overflow */
43657 +                       info.si_code = FPE_FLTOVF;
43658 +                       break;
43659 +               case 0x020: /* Precision */
43660 +                       info.si_code = FPE_FLTRES;
43661 +                       break;
43662 +       }
43663 +       force_sig_info(SIGFPE, &info, task);
43664 +}
43665 +
43666 +asmlinkage void bad_intr(void)
43667 +{
43668 +       printk("bad interrupt"); 
43669 +}
43670 +
43671 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
43672 +{
43673 +       void __user *rip = (void __user *)(regs->rip);
43674 +       struct task_struct * task;
43675 +       siginfo_t info;
43676 +       unsigned short mxcsr;
43677 +
43678 +       conditional_sti(regs);
43679 +       if (!user_mode(regs) &&
43680 +               kernel_math_error(regs, "kernel simd math error", 19))
43681 +               return;
43682 +
43683 +       /*
43684 +        * Save the info for the exception handler and clear the error.
43685 +        */
43686 +       task = current;
43687 +       save_init_fpu(task);
43688 +       task->thread.trap_no = 19;
43689 +       task->thread.error_code = 0;
43690 +       info.si_signo = SIGFPE;
43691 +       info.si_errno = 0;
43692 +       info.si_code = __SI_FAULT;
43693 +       info.si_addr = rip;
43694 +       /*
43695 +        * The SIMD FPU exceptions are handled a little differently, as there
43696 +        * is only a single status/control register.  Thus, to determine which
43697 +        * unmasked exception was caught we must mask the exception mask bits
43698 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
43699 +        */
43700 +       mxcsr = get_fpu_mxcsr(task);
43701 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
43702 +               case 0x000:
43703 +               default:
43704 +                       break;
43705 +               case 0x001: /* Invalid Op */
43706 +                       info.si_code = FPE_FLTINV;
43707 +                       break;
43708 +               case 0x002: /* Denormalize */
43709 +               case 0x010: /* Underflow */
43710 +                       info.si_code = FPE_FLTUND;
43711 +                       break;
43712 +               case 0x004: /* Zero Divide */
43713 +                       info.si_code = FPE_FLTDIV;
43714 +                       break;
43715 +               case 0x008: /* Overflow */
43716 +                       info.si_code = FPE_FLTOVF;
43717 +                       break;
43718 +               case 0x020: /* Precision */
43719 +                       info.si_code = FPE_FLTRES;
43720 +                       break;
43721 +       }
43722 +       force_sig_info(SIGFPE, &info, task);
43723 +}
43724 +
43725 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
43726 +{
43727 +}
43728 +
43729 +#if 0
43730 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
43731 +{
43732 +}
43733 +#endif
43734 +
43735 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
43736 +{
43737 +}
43738 +
43739 +/*
43740 + *  'math_state_restore()' saves the current math information in the
43741 + * old math state array, and gets the new ones from the current task
43742 + *
43743 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
43744 + * Don't touch unless you *really* know how it works.
43745 + */
43746 +asmlinkage void math_state_restore(void)
43747 +{
43748 +       struct task_struct *me = current;
43749 +        /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
43750 +
43751 +       if (!used_math())
43752 +               init_fpu(me);
43753 +       restore_fpu_checking(&me->thread.i387.fxsave);
43754 +       task_thread_info(me)->status |= TS_USEDFPU;
43755 +       me->fpu_counter++;
43756 +}
43757 +
43758 +
43759 +/*
43760 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
43761 + * specify <dpl>|4 in the second field.
43762 + */
43763 +static trap_info_t trap_table[] = {
43764 +        {  0, 0|4, __KERNEL_CS, (unsigned long)divide_error               },
43765 +        {  1, 0|4, __KERNEL_CS, (unsigned long)debug                      },
43766 +        {  3, 3|4, __KERNEL_CS, (unsigned long)int3                       },
43767 +        {  4, 3|4, __KERNEL_CS, (unsigned long)overflow                   },
43768 +        {  5, 0|4, __KERNEL_CS, (unsigned long)bounds                     },
43769 +        {  6, 0|4, __KERNEL_CS, (unsigned long)invalid_op                 },
43770 +        {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available       },
43771 +        {  9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
43772 +        { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS                },
43773 +        { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present        },
43774 +        { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment              },
43775 +        { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection         },
43776 +        { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault                 },
43777 +        { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug     },
43778 +        { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error          },
43779 +        { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check            },
43780 +#ifdef CONFIG_X86_MCE
43781 +        { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check              },
43782 +#endif
43783 +        { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
43784 +#ifdef CONFIG_IA32_EMULATION
43785 +       { IA32_SYSCALL_VECTOR, 3|4, __KERNEL_CS, (unsigned long)ia32_syscall},
43786 +#endif
43787 +        {  0, 0,           0, 0                                              }
43788 +};
43789 +
43790 +void __init trap_init(void)
43791 +{
43792 +        int ret;
43793 +
43794 +        ret = HYPERVISOR_set_trap_table(trap_table);
43795 +        
43796 +        if (ret) 
43797 +                printk("HYPERVISOR_set_trap_table failed: error %d\n",
43798 +                       ret);
43799 +
43800 +       /*
43801 +        * Should be a barrier for any external CPU state.
43802 +        */
43803 +       cpu_init();
43804 +}
43805 +
43806 +void smp_trap_init(trap_info_t *trap_ctxt)
43807 +{
43808 +       trap_info_t *t = trap_table;
43809 +
43810 +       for (t = trap_table; t->address; t++) {
43811 +               trap_ctxt[t->vector].flags = t->flags;
43812 +               trap_ctxt[t->vector].cs = t->cs;
43813 +               trap_ctxt[t->vector].address = t->address;
43814 +       }
43815 +}
43816 +
43817 +
43818 +static int __init oops_setup(char *s)
43819 +{ 
43820 +       if (!s)
43821 +               return -EINVAL;
43822 +       if (!strcmp(s, "panic"))
43823 +               panic_on_oops = 1;
43824 +       return 0;
43825 +} 
43826 +early_param("oops", oops_setup);
43827 +
43828 +static int __init kstack_setup(char *s)
43829 +{
43830 +       if (!s)
43831 +               return -EINVAL;
43832 +       kstack_depth_to_print = simple_strtoul(s,NULL,0);
43833 +       return 0;
43834 +}
43835 +early_param("kstack", kstack_setup);
43836 +
43837 +#ifdef CONFIG_STACK_UNWIND
43838 +static int __init call_trace_setup(char *s)
43839 +{
43840 +       if (!s)
43841 +               return -EINVAL;
43842 +       if (strcmp(s, "old") == 0)
43843 +               call_trace = -1;
43844 +       else if (strcmp(s, "both") == 0)
43845 +               call_trace = 0;
43846 +       else if (strcmp(s, "newfallback") == 0)
43847 +               call_trace = 1;
43848 +       else if (strcmp(s, "new") == 0)
43849 +               call_trace = 2;
43850 +       return 0;
43851 +}
43852 +early_param("call_trace", call_trace_setup);
43853 +#endif
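The traps-xen.c file above registers its exception vectors with the hypervisor through HYPERVISOR_set_trap_table() rather than loading a hardware IDT, and the comment before trap_table notes that every entry behaves as an interrupt gate because the flags field is written as <dpl>|4. A rough illustration of that encoding, assuming the Xen 3.0.x trap_info flags layout (the macro names below are illustrative and not part of the patch):

/* Sketch only: how the trap_table flags byte above is interpreted. */
#define TRAP_DPL(flags)          ((flags) & 3)   /* lowest privilege level allowed to raise the vector */
#define TRAP_MASK_EVENTS(flags)  ((flags) & 4)   /* mask event delivery on entry (interrupt-gate behaviour) */

/* Example: int3 is registered as 3|4, i.e. callable from ring 3 with
 * event delivery masked while the handler runs. */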
43854 diff -ruNp linux-2.6.19/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/vmlinux.lds.S
43855 --- linux-2.6.19/arch/x86_64/kernel/vmlinux.lds.S       2006-11-29 21:57:37.000000000 +0000
43856 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/vmlinux.lds.S     2007-02-02 19:10:27.000000000 +0000
43857 @@ -235,4 +235,6 @@ SECTIONS
43858    STABS_DEBUG
43859  
43860    DWARF_DEBUG
43861 +
43862 +  NOTES
43863  }
43864 diff -ruNp linux-2.6.19/arch/x86_64/kernel/vsyscall-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/vsyscall-xen.c
43865 --- linux-2.6.19/arch/x86_64/kernel/vsyscall-xen.c      1970-01-01 00:00:00.000000000 +0000
43866 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/vsyscall-xen.c    2007-02-02 19:10:27.000000000 +0000
43867 @@ -0,0 +1,329 @@
43868 +/*
43869 + *  linux/arch/x86_64/kernel/vsyscall.c
43870 + *
43871 + *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
43872 + *  Copyright 2003 Andi Kleen, SuSE Labs.
43873 + *
43874 + *  Thanks to hpa@transmeta.com for some useful hint.
43875 + *  Special thanks to Ingo Molnar for his early experience with
43876 + *  a different vsyscall implementation for Linux/IA32 and for the name.
43877 + *
43878 + *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
43879 + *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
43880 + *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
43881 + *  jumping out of line if necessary. We cannot add more with this
43882 + *  mechanism because older kernels won't return -ENOSYS.
43883 + *  If we want more than four we need a vDSO.
43884 + *
43885 + *  Note: the concept clashes with user mode linux. If you use UML and
43886 + *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
43887 + */
43888 +
43889 +#include <linux/time.h>
43890 +#include <linux/init.h>
43891 +#include <linux/kernel.h>
43892 +#include <linux/timer.h>
43893 +#include <linux/seqlock.h>
43894 +#include <linux/jiffies.h>
43895 +#include <linux/sysctl.h>
43896 +#include <linux/getcpu.h>
43897 +#include <linux/cpu.h>
43898 +#include <linux/smp.h>
43899 +#include <linux/notifier.h>
43900 +
43901 +#include <asm/vsyscall.h>
43902 +#include <asm/pgtable.h>
43903 +#include <asm/page.h>
43904 +#include <asm/fixmap.h>
43905 +#include <asm/errno.h>
43906 +#include <asm/io.h>
43907 +#include <asm/segment.h>
43908 +#include <asm/desc.h>
43909 +#include <asm/topology.h>
43910 +
43911 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
43912 +
43913 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
43914 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
43915 +int __vgetcpu_mode __section_vgetcpu_mode;
43916 +
43917 +#include <asm/unistd.h>
43918 +
43919 +static __always_inline void timeval_normalize(struct timeval * tv)
43920 +{
43921 +       time_t __sec;
43922 +
43923 +       __sec = tv->tv_usec / 1000000;
43924 +       if (__sec) {
43925 +               tv->tv_usec %= 1000000;
43926 +               tv->tv_sec += __sec;
43927 +       }
43928 +}
43929 +
43930 +static __always_inline void do_vgettimeofday(struct timeval * tv)
43931 +{
43932 +       long sequence, t;
43933 +       unsigned long sec, usec;
43934 +
43935 +       do {
43936 +               sequence = read_seqbegin(&__xtime_lock);
43937 +               
43938 +               sec = __xtime.tv_sec;
43939 +               usec = __xtime.tv_nsec / 1000;
43940 +
43941 +               if (__vxtime.mode != VXTIME_HPET) {
43942 +                       t = get_cycles_sync();
43943 +                       if (t < __vxtime.last_tsc)
43944 +                               t = __vxtime.last_tsc;
43945 +                       usec += ((t - __vxtime.last_tsc) *
43946 +                                __vxtime.tsc_quot) >> 32;
43947 +                       /* See comment in x86_64 do_gettimeofday. */
43948 +               } else {
43949 +                       usec += ((readl((void __iomem *)
43950 +                                  fix_to_virt(VSYSCALL_HPET) + 0xf0) -
43951 +                                 __vxtime.last) * __vxtime.quot) >> 32;
43952 +               }
43953 +       } while (read_seqretry(&__xtime_lock, sequence));
43954 +
43955 +       tv->tv_sec = sec + usec / 1000000;
43956 +       tv->tv_usec = usec % 1000000;
43957 +}
43958 +
43959 +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
43960 +static __always_inline void do_get_tz(struct timezone * tz)
43961 +{
43962 +       *tz = __sys_tz;
43963 +}
43964 +
43965 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
43966 +{
43967 +       int ret;
43968 +       asm volatile("vsysc2: syscall"
43969 +               : "=a" (ret)
43970 +               : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
43971 +       return ret;
43972 +}
43973 +
43974 +static __always_inline long time_syscall(long *t)
43975 +{
43976 +       long secs;
43977 +       asm volatile("vsysc1: syscall"
43978 +               : "=a" (secs)
43979 +               : "0" (__NR_time),"D" (t) : __syscall_clobber);
43980 +       return secs;
43981 +}
43982 +
43983 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
43984 +{
43985 +       if (!__sysctl_vsyscall)
43986 +               return gettimeofday(tv,tz);
43987 +       if (tv)
43988 +               do_vgettimeofday(tv);
43989 +       if (tz)
43990 +               do_get_tz(tz);
43991 +       return 0;
43992 +}
43993 +
43994 +/* This will break when the xtime seconds get inaccurate, but that is
43995 + * unlikely */
43996 +time_t __vsyscall(1) vtime(time_t *t)
43997 +{
43998 +       if (!__sysctl_vsyscall)
43999 +               return time_syscall(t);
44000 +       else if (t)
44001 +               *t = __xtime.tv_sec;            
44002 +       return __xtime.tv_sec;
44003 +}
44004 +
44005 +/* Fast way to get current CPU and node.
44006 +   This helps to do per node and per CPU caches in user space.
44007 +   The result is not guaranteed without CPU affinity, but usually
44008 +   works out because the scheduler tries to keep a thread on the same
44009 +   CPU.
44010 +
44011 +   tcache must point to a two element sized long array.
44012 +   All arguments can be NULL. */
44013 +long __vsyscall(2)
44014 +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
44015 +{
44016 +       unsigned int dummy, p;
44017 +       unsigned long j = 0;
44018 +
44019 +       /* Fast cache - only recompute value once per jiffies and avoid
44020 +          relatively costly rdtscp/cpuid otherwise.
44021 +          This works because the scheduler usually keeps the process
44022 +          on the same CPU and this syscall doesn't guarantee its
44023 +          results anyways.
44024 +          We do this here because otherwise user space would do it on
44025 +          its own in a likely inferior way (no access to jiffies).
44026 +          If you don't like it pass NULL. */
44027 +       if (tcache && tcache->blob[0] == (j = __jiffies)) {
44028 +               p = tcache->blob[1];
44029 +       } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
44030 +               /* Load per CPU data from RDTSCP */
44031 +               rdtscp(dummy, dummy, p);
44032 +       } else {
44033 +               /* Load per CPU data from GDT */
44034 +               asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
44035 +       }
44036 +       if (tcache) {
44037 +               tcache->blob[0] = j;
44038 +               tcache->blob[1] = p;
44039 +       }
44040 +       if (cpu)
44041 +               *cpu = p & 0xfff;
44042 +       if (node)
44043 +               *node = p >> 12;
44044 +       return 0;
44045 +}
44046 +
44047 +long __vsyscall(3) venosys_1(void)
44048 +{
44049 +       return -ENOSYS;
44050 +}
44051 +
44052 +#ifdef CONFIG_SYSCTL
44053 +
44054 +#define SYSCALL 0x050f
44055 +#define NOP2    0x9090
44056 +
44057 +/*
44058 + * NOP out syscall in vsyscall page when not needed.
44059 + */
44060 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
44061 +                        void __user *buffer, size_t *lenp, loff_t *ppos)
44062 +{
44063 +       extern u16 vsysc1, vsysc2;
44064 +       u16 __iomem *map1;
44065 +       u16 __iomem *map2;
44066 +       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
44067 +       if (!write)
44068 +               return ret;
44069 +       /* gcc has some trouble with __va(__pa()), so just do it this
44070 +          way. */
44071 +       map1 = ioremap(__pa_symbol(&vsysc1), 2);
44072 +       if (!map1)
44073 +               return -ENOMEM;
44074 +       map2 = ioremap(__pa_symbol(&vsysc2), 2);
44075 +       if (!map2) {
44076 +               ret = -ENOMEM;
44077 +               goto out;
44078 +       }
44079 +       if (!sysctl_vsyscall) {
44080 +               writew(SYSCALL, map1);
44081 +               writew(SYSCALL, map2);
44082 +       } else {
44083 +               writew(NOP2, map1);
44084 +               writew(NOP2, map2);
44085 +       }
44086 +       iounmap(map2);
44087 +out:
44088 +       iounmap(map1);
44089 +       return ret;
44090 +}
44091 +
44092 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
44093 +                               void __user *oldval, size_t __user *oldlenp,
44094 +                               void __user *newval, size_t newlen,
44095 +                               void **context)
44096 +{
44097 +       return -ENOSYS;
44098 +}
44099 +
44100 +static ctl_table kernel_table2[] = {
44101 +       { .ctl_name = 99, .procname = "vsyscall64",
44102 +         .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
44103 +         .strategy = vsyscall_sysctl_nostrat,
44104 +         .proc_handler = vsyscall_sysctl_change },
44105 +       { 0, }
44106 +};
44107 +
44108 +static ctl_table kernel_root_table2[] = {
44109 +       { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
44110 +         .child = kernel_table2 },
44111 +       { 0 },
44112 +};
44113 +
44114 +#endif
44115 +
44116 +/* Assume __initcall executes before all user space. Hopefully kmod
44117 +   doesn't violate that. We'll find out if it does. */
44118 +static void __cpuinit vsyscall_set_cpu(int cpu)
44119 +{
44120 +       unsigned long *d;
44121 +       unsigned long node = 0;
44122 +#ifdef CONFIG_NUMA
44123 +       node = cpu_to_node[cpu];
44124 +#endif
44125 +       if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
44126 +               write_rdtscp_aux((node << 12) | cpu);
44127 +
44128 +       /* Store cpu number in limit so that it can be loaded quickly
44129 +          in user space in vgetcpu.
44130 +          12 bits for the CPU and 8 bits for the node. */
44131 +       d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
44132 +       *d = 0x0f40000000000ULL;
44133 +       *d |= cpu;
44134 +       *d |= (node & 0xf) << 12;
44135 +       *d |= (node >> 4) << 48;
44136 +}
44137 +
44138 +static void __cpuinit cpu_vsyscall_init(void *arg)
44139 +{
44140 +       /* preemption should be already off */
44141 +       vsyscall_set_cpu(raw_smp_processor_id());
44142 +}
44143 +
44144 +#ifdef CONFIG_HOTPLUG_CPU
44145 +static int __cpuinit
44146 +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
44147 +{
44148 +       long cpu = (long)arg;
44149 +       if (action == CPU_ONLINE)
44150 +               smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
44151 +       return NOTIFY_DONE;
44152 +}
44153 +#endif
44154 +
44155 +static void __init map_vsyscall(void)
44156 +{
44157 +       extern char __vsyscall_0;
44158 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
44159 +
44160 +       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
44161 +}
44162 +
44163 +#ifdef CONFIG_XEN
44164 +static void __init map_vsyscall_user(void)
44165 +{
44166 +       extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t);
44167 +       extern char __vsyscall_0;
44168 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
44169 +
44170 +       __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
44171 +}
44172 +#endif
44173 +
44174 +static int __init vsyscall_init(void)
44175 +{
44176 +       BUG_ON(((unsigned long) &vgettimeofday !=
44177 +                       VSYSCALL_ADDR(__NR_vgettimeofday)));
44178 +       BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
44179 +       BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
44180 +       BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
44181 +       map_vsyscall();
44182 +#ifdef CONFIG_XEN
44183 +       map_vsyscall_user();
44184 +       sysctl_vsyscall = 0; /* disable vgettimeofday() */
44185 +#endif
44186 +#ifdef CONFIG_SYSCTL
44187 +       register_sysctl_table(kernel_root_table2, 0);
44188 +#endif
44189 +#ifndef CONFIG_XEN
44190 +       on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
44191 +       hotcpu_notifier(cpu_vsyscall_notifier, 0);
44192 +#endif
44193 +       return 0;
44194 +}
44195 +
44196 +__initcall(vsyscall_init);
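vgetcpu() above packs the node and CPU numbers into one word, (node << 12) | cpu, and stashes it in the per-CPU GDT limit (and in TSC_AUX where RDTSCP exists) so user space can recover both without a system call. A minimal user-space sketch of decoding that value, assuming the conventional fixed vsyscall slot address for this kernel generation (the address constant and the wrapper are illustrative, not taken from the patch):

#include <stdio.h>

/* signature mirrors vgetcpu(cpu, node, tcache) in the file above */
typedef long (*vgetcpu_t)(unsigned *cpu, unsigned *node, void *tcache);

int main(void)
{
        /* assumed: vsyscall slot 2 at the fixed -10MB mapping */
        vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL;
        unsigned cpu = 0, node = 0;

        vgetcpu(&cpu, &node, NULL);          /* cpu = p & 0xfff, node = p >> 12 */
        printf("running on cpu %u, node %u\n", cpu, node);
        return 0;
}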
44197 diff -ruNp linux-2.6.19/arch/x86_64/kernel/xen_entry.S linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/xen_entry.S
44198 --- linux-2.6.19/arch/x86_64/kernel/xen_entry.S 1970-01-01 00:00:00.000000000 +0000
44199 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/kernel/xen_entry.S       2007-02-02 19:10:27.000000000 +0000
44200 @@ -0,0 +1,40 @@
44201 +/*
44202 + * Copied from arch/xen/i386/kernel/entry.S
44203 + */                        
44204 +/* Offsets into shared_info_t. */                
44205 +#define evtchn_upcall_pending          /* 0 */
44206 +#define evtchn_upcall_mask             1
44207 +
44208 +#define sizeof_vcpu_shift              6
44209 +
44210 +#ifdef CONFIG_SMP
44211 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
44212 +//#define preempt_enable(reg)  decl threadinfo_preempt_count(reg)
44213 +#define preempt_disable(reg)
44214 +#define preempt_enable(reg)
44215 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp)                   ; \
44216 +                               movq %gs:pda_cpunumber,reg              ; \
44217 +                               shl  $32, reg                           ; \
44218 +                               shr  $32-sizeof_vcpu_shift,reg          ; \
44219 +                               addq HYPERVISOR_shared_info,reg
44220 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp)                    ; \
44221 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
44222 +#else
44223 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
44224 +#define XEN_PUT_VCPU_INFO(reg)
44225 +#define XEN_PUT_VCPU_INFO_fixup
44226 +#endif
44227 +
44228 +#define XEN_LOCKED_BLOCK_EVENTS(reg)   movb $1,evtchn_upcall_mask(reg)
44229 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
44230 +#define XEN_BLOCK_EVENTS(reg)  XEN_GET_VCPU_INFO(reg)                  ; \
44231 +                               XEN_LOCKED_BLOCK_EVENTS(reg)            ; \
44232 +                               XEN_PUT_VCPU_INFO(reg)
44233 +#define XEN_UNBLOCK_EVENTS(reg)        XEN_GET_VCPU_INFO(reg)                  ; \
44234 +                               XEN_LOCKED_UNBLOCK_EVENTS(reg)          ; \
44235 +                               XEN_PUT_VCPU_INFO(reg)
44236 +#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
44237 +
44238 +VGCF_IN_SYSCALL = (1<<8)
44239 +        
44240 +       
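xen_entry.S above defines the assembler side of event masking: XEN_BLOCK_EVENTS and XEN_UNBLOCK_EVENTS locate the current VCPU's slot in the shared info page (64 bytes per VCPU, hence sizeof_vcpu_shift == 6) and toggle the evtchn_upcall_mask byte, the paravirtual stand-in for cli/sti. A C-level sketch of the same operation, with a deliberately simplified structure layout (the field offsets match the #defines above; everything else is illustrative):

/* Sketch only: the two bytes the macros above touch. */
struct vcpu_info_sketch {
        unsigned char evtchn_upcall_pending;    /* offset 0 */
        unsigned char evtchn_upcall_mask;       /* offset 1 */
        /* ... padded to 64 bytes per VCPU ... */
};

static inline void xen_block_events(volatile struct vcpu_info_sketch *v)
{
        v->evtchn_upcall_mask = 1;              /* XEN_LOCKED_BLOCK_EVENTS */
}

static inline void xen_unblock_events(volatile struct vcpu_info_sketch *v)
{
        v->evtchn_upcall_mask = 0;              /* XEN_LOCKED_UNBLOCK_EVENTS */
        /* a real unblock would then test evtchn_upcall_pending and force
           an upcall if events arrived while masked */
}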
44241 diff -ruNp linux-2.6.19/arch/x86_64/mm/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/mm/Makefile
44242 --- linux-2.6.19/arch/x86_64/mm/Makefile        2006-11-29 21:57:37.000000000 +0000
44243 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/mm/Makefile      2007-02-02 19:10:27.000000000 +0000
44244 @@ -9,3 +9,13 @@ obj-$(CONFIG_K8_NUMA) += k8topology.o
44245  obj-$(CONFIG_ACPI_NUMA) += srat.o
44246  
44247  hugetlbpage-y = ../../i386/mm/hugetlbpage.o
44248 +
44249 +ifdef CONFIG_XEN
44250 +include $(srctree)/scripts/Makefile.xen
44251 +
44252 +ioremap-y      += ../../i386/mm/ioremap-xen.o
44253 +hypervisor-y   += ../../i386/mm/hypervisor.o
44254 +obj-y          += hypervisor.o
44255 +
44256 +obj-y := $(call cherrypickxen, $(obj-y))
44257 +endif
44258 diff -ruNp linux-2.6.19/arch/x86_64/mm/fault-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/mm/fault-xen.c
44259 --- linux-2.6.19/arch/x86_64/mm/fault-xen.c     1970-01-01 00:00:00.000000000 +0000
44260 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/mm/fault-xen.c   2007-02-02 19:10:27.000000000 +0000
44261 @@ -0,0 +1,717 @@
44262 +/*
44263 + *  linux/arch/x86-64/mm/fault.c
44264 + *
44265 + *  Copyright (C) 1995  Linus Torvalds
44266 + *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
44267 + */
44268 +
44269 +#include <linux/signal.h>
44270 +#include <linux/sched.h>
44271 +#include <linux/kernel.h>
44272 +#include <linux/errno.h>
44273 +#include <linux/string.h>
44274 +#include <linux/types.h>
44275 +#include <linux/ptrace.h>
44276 +#include <linux/mman.h>
44277 +#include <linux/mm.h>
44278 +#include <linux/smp.h>
44279 +#include <linux/smp_lock.h>
44280 +#include <linux/interrupt.h>
44281 +#include <linux/init.h>
44282 +#include <linux/tty.h>
44283 +#include <linux/vt_kern.h>             /* For unblank_screen() */
44284 +#include <linux/compiler.h>
44285 +#include <linux/module.h>
44286 +#include <linux/kprobes.h>
44287 +
44288 +#include <asm/system.h>
44289 +#include <asm/uaccess.h>
44290 +#include <asm/pgalloc.h>
44291 +#include <asm/smp.h>
44292 +#include <asm/tlbflush.h>
44293 +#include <asm/proto.h>
44294 +#include <asm/kdebug.h>
44295 +#include <asm-generic/sections.h>
44296 +
44297 +/* Page fault error code bits */
44298 +#define PF_PROT        (1<<0)          /* or no page found */
44299 +#define PF_WRITE       (1<<1)
44300 +#define PF_USER        (1<<2)
44301 +#define PF_RSVD        (1<<3)
44302 +#define PF_INSTR       (1<<4)
44303 +
44304 +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
44305 +
44306 +/* Hook to register for page fault notifications */
44307 +int register_page_fault_notifier(struct notifier_block *nb)
44308 +{
44309 +       vmalloc_sync_all();
44310 +       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
44311 +}
44312 +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
44313 +
44314 +int unregister_page_fault_notifier(struct notifier_block *nb)
44315 +{
44316 +       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
44317 +}
44318 +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
44319 +
44320 +static inline int notify_page_fault(enum die_val val, const char *str,
44321 +                       struct pt_regs *regs, long err, int trap, int sig)
44322 +{
44323 +       struct die_args args = {
44324 +               .regs = regs,
44325 +               .str = str,
44326 +               .err = err,
44327 +               .trapnr = trap,
44328 +               .signr = sig
44329 +       };
44330 +       return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
44331 +}
44332 +
44333 +void bust_spinlocks(int yes)
44334 +{
44335 +       int loglevel_save = console_loglevel;
44336 +       if (yes) {
44337 +               oops_in_progress = 1;
44338 +       } else {
44339 +#ifdef CONFIG_VT
44340 +               unblank_screen();
44341 +#endif
44342 +               oops_in_progress = 0;
44343 +               /*
44344 +                * OK, the message is on the console.  Now we call printk()
44345 +                * without oops_in_progress set so that printk will give klogd
44346 +                * a poke.  Hold onto your hats...
44347 +                */
44348 +               console_loglevel = 15;          /* NMI oopser may have shut the console up */
44349 +               printk(" ");
44350 +               console_loglevel = loglevel_save;
44351 +       }
44352 +}
44353 +
44354 +/* Sometimes the CPU reports invalid exceptions on prefetch.
44355 +   Check that here and ignore.
44356 +   Opcode checker based on code by Richard Brunner */
44357 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
44358 +                               unsigned long error_code)
44359 +{ 
44360 +       unsigned char __user *instr;
44361 +       int scan_more = 1;
44362 +       int prefetch = 0; 
44363 +       unsigned char *max_instr;
44364 +
44365 +       /* If it was an exec fault, ignore it */
44366 +       if (error_code & PF_INSTR)
44367 +               return 0;
44368 +       
44369 +       instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
44370 +       max_instr = instr + 15;
44371 +
44372 +       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
44373 +               return 0;
44374 +
44375 +       while (scan_more && instr < max_instr) { 
44376 +               unsigned char opcode;
44377 +               unsigned char instr_hi;
44378 +               unsigned char instr_lo;
44379 +
44380 +               if (__get_user(opcode, (char __user *)instr))
44381 +                       break; 
44382 +
44383 +               instr_hi = opcode & 0xf0; 
44384 +               instr_lo = opcode & 0x0f; 
44385 +               instr++;
44386 +
44387 +               switch (instr_hi) { 
44388 +               case 0x20:
44389 +               case 0x30:
44390 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
44391 +                          prefixes.  In long mode, the CPU will signal
44392 +                          invalid opcode if some of these prefixes are
44393 +                          present so we will never get here anyway */
44394 +                       scan_more = ((instr_lo & 7) == 0x6);
44395 +                       break;
44396 +                       
44397 +               case 0x40:
44398 +                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
44399 +                          Need to figure out under what instruction mode the
44400 +                          instruction was issued ... */
44401 +                       /* Could check the LDT for lm, but for now it's good
44402 +                          enough to assume that long mode only uses well known
44403 +                          segments or kernel. */
44404 +                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
44405 +                       break;
44406 +                       
44407 +               case 0x60:
44408 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
44409 +                       scan_more = (instr_lo & 0xC) == 0x4;
44410 +                       break;          
44411 +               case 0xF0:
44412 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
44413 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
44414 +                       break;                  
44415 +               case 0x00:
44416 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
44417 +                       scan_more = 0;
44418 +                       if (__get_user(opcode, (char __user *)instr))
44419 +                               break;
44420 +                       prefetch = (instr_lo == 0xF) &&
44421 +                               (opcode == 0x0D || opcode == 0x18);
44422 +                       break;                  
44423 +               default:
44424 +                       scan_more = 0;
44425 +                       break;
44426 +               } 
44427 +       }
44428 +       return prefetch;
44429 +}
44430 +
44431 +static int bad_address(void *p) 
44432 +{ 
44433 +       unsigned long dummy;
44434 +       return __get_user(dummy, (unsigned long __user *)p);
44435 +} 
44436 +
44437 +void dump_pagetable(unsigned long address)
44438 +{
44439 +       pgd_t *pgd;
44440 +       pud_t *pud;
44441 +       pmd_t *pmd;
44442 +       pte_t *pte;
44443 +
44444 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
44445 +       pgd += pgd_index(address);
44446 +       if (bad_address(pgd)) goto bad;
44447 +       printk("PGD %lx ", pgd_val(*pgd));
44448 +       if (!pgd_present(*pgd)) goto ret; 
44449 +
44450 +       pud = pud_offset(pgd, address);
44451 +       if (bad_address(pud)) goto bad;
44452 +       printk("PUD %lx ", pud_val(*pud));
44453 +       if (!pud_present(*pud)) goto ret;
44454 +
44455 +       pmd = pmd_offset(pud, address);
44456 +       if (bad_address(pmd)) goto bad;
44457 +       printk("PMD %lx ", pmd_val(*pmd));
44458 +       if (!pmd_present(*pmd)) goto ret;        
44459 +
44460 +       pte = pte_offset_kernel(pmd, address);
44461 +       if (bad_address(pte)) goto bad;
44462 +       printk("PTE %lx", pte_val(*pte)); 
44463 +ret:
44464 +       printk("\n");
44465 +       return;
44466 +bad:
44467 +       printk("BAD\n");
44468 +}
44469 +
44470 +static const char errata93_warning[] = 
44471 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
44472 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
44473 +KERN_ERR "******* Please consider a BIOS update.\n"
44474 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
44475 +
44476 +/* Workaround for K8 erratum #93 & buggy BIOS.
44477 +   BIOS SMM functions are required to use a specific workaround
44478 +   to avoid corruption of the 64bit RIP register on C stepping K8. 
44479 +   A lot of BIOS that didn't get tested properly miss this. 
44480 +   The OS sees this as a page fault with the upper 32bits of RIP cleared.
44481 +   Try to work around it here.
44482 +   Note we only handle faults in kernel here. */
44483 +
44484 +static int is_errata93(struct pt_regs *regs, unsigned long address) 
44485 +{
44486 +       static int warned;
44487 +       if (address != regs->rip)
44488 +               return 0;
44489 +       if ((address >> 32) != 0) 
44490 +               return 0;
44491 +       address |= 0xffffffffUL << 32;
44492 +       if ((address >= (u64)_stext && address <= (u64)_etext) || 
44493 +           (address >= MODULES_VADDR && address <= MODULES_END)) { 
44494 +               if (!warned) {
44495 +                       printk(errata93_warning);               
44496 +                       warned = 1;
44497 +               }
44498 +               regs->rip = address;
44499 +               return 1;
44500 +       }
44501 +       return 0;
44502 +} 
44503 +
44504 +int unhandled_signal(struct task_struct *tsk, int sig)
44505 +{
44506 +       if (is_init(tsk))
44507 +               return 1;
44508 +       if (tsk->ptrace & PT_PTRACED)
44509 +               return 0;
44510 +       return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
44511 +               (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
44512 +}
44513 +
44514 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
44515 +                                unsigned long error_code)
44516 +{
44517 +       unsigned long flags = oops_begin();
44518 +       struct task_struct *tsk;
44519 +
44520 +       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
44521 +              current->comm, address);
44522 +       dump_pagetable(address);
44523 +       tsk = current;
44524 +       tsk->thread.cr2 = address;
44525 +       tsk->thread.trap_no = 14;
44526 +       tsk->thread.error_code = error_code;
44527 +       __die("Bad pagetable", regs, error_code);
44528 +       oops_end(flags);
44529 +       do_exit(SIGKILL);
44530 +}
44531 +
44532 +/*
44533 + * Handle a fault on the vmalloc area
44534 + *
44535 + * This assumes no large pages in there.
44536 + */
44537 +static int vmalloc_fault(unsigned long address)
44538 +{
44539 +       pgd_t *pgd, *pgd_ref;
44540 +       pud_t *pud, *pud_ref;
44541 +       pmd_t *pmd, *pmd_ref;
44542 +       pte_t *pte, *pte_ref;
44543 +
44544 +       /* Copy kernel mappings over when needed. This can also
44545 +          happen within a race in page table update. In the later
44546 +          happen within a race in page table update. In the latter
44547 +
44548 +       /* On Xen the line below does not always work. Needs investigating! */
44549 +       /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
44550 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
44551 +       pgd += pgd_index(address);
44552 +       pgd_ref = pgd_offset_k(address);
44553 +       if (pgd_none(*pgd_ref))
44554 +               return -1;
44555 +       if (pgd_none(*pgd))
44556 +               set_pgd(pgd, *pgd_ref);
44557 +       else
44558 +               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
44559 +
44560 +       /* Below here mismatches are bugs because these lower tables
44561 +          are shared */
44562 +
44563 +       pud = pud_offset(pgd, address);
44564 +       pud_ref = pud_offset(pgd_ref, address);
44565 +       if (pud_none(*pud_ref))
44566 +               return -1;
44567 +       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
44568 +               BUG();
44569 +       pmd = pmd_offset(pud, address);
44570 +       pmd_ref = pmd_offset(pud_ref, address);
44571 +       if (pmd_none(*pmd_ref))
44572 +               return -1;
44573 +       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
44574 +               BUG();
44575 +       pte_ref = pte_offset_kernel(pmd_ref, address);
44576 +       if (!pte_present(*pte_ref))
44577 +               return -1;
44578 +       pte = pte_offset_kernel(pmd, address);
44579 +       /* Don't use pte_page here, because the mappings can point
44580 +          outside mem_map, and the NUMA hash lookup cannot handle
44581 +          that. */
44582 +       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
44583 +               BUG();
44584 +       return 0;
44585 +}
44586 +
44587 +int page_fault_trace = 0;
44588 +int exception_trace = 1;
44589 +
44590 +
44591 +#define MEM_VERBOSE 1
44592 +
44593 +#ifdef MEM_VERBOSE
44594 +#define MEM_LOG(_f, _a...)                     \
44595 +       printk("fault.c:[%d]-> " _f "\n",       \
44596 +       __LINE__ , ## _a )
44597 +#else
44598 +#define MEM_LOG(_f, _a...) ((void)0)
44599 +#endif
44600 +
44601 +static int spurious_fault(struct pt_regs *regs,
44602 +                         unsigned long address,
44603 +                         unsigned long error_code)
44604 +{
44605 +       pgd_t *pgd;
44606 +       pud_t *pud;
44607 +       pmd_t *pmd;
44608 +       pte_t *pte;
44609 +
44610 +#ifdef CONFIG_XEN
44611 +       /* Faults in hypervisor area are never spurious. */
44612 +       if ((address >= HYPERVISOR_VIRT_START) &&
44613 +           (address < HYPERVISOR_VIRT_END))
44614 +               return 0;
44615 +#endif
44616 +
44617 +       /* Reserved-bit violation or user access to kernel space? */
44618 +       if (error_code & (PF_RSVD|PF_USER))
44619 +               return 0;
44620 +
44621 +       pgd = init_mm.pgd + pgd_index(address);
44622 +       if (!pgd_present(*pgd))
44623 +               return 0;
44624 +
44625 +       pud = pud_offset(pgd, address);
44626 +       if (!pud_present(*pud))
44627 +               return 0;
44628 +
44629 +       pmd = pmd_offset(pud, address);
44630 +       if (!pmd_present(*pmd))
44631 +               return 0;
44632 +
44633 +       pte = pte_offset_kernel(pmd, address);
44634 +       if (!pte_present(*pte))
44635 +               return 0;
44636 +       if ((error_code & PF_WRITE) && !pte_write(*pte))
44637 +               return 0;
44638 +       if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
44639 +               return 0;
44640 +
44641 +       return 1;
44642 +}
44643 +
44644 +/*
44645 + * This routine handles page faults.  It determines the address,
44646 + * and the problem, and then passes it off to one of the appropriate
44647 + * routines.
44648 + */
44649 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
44650 +                                       unsigned long error_code)
44651 +{
44652 +       struct task_struct *tsk;
44653 +       struct mm_struct *mm;
44654 +       struct vm_area_struct * vma;
44655 +       unsigned long address;
44656 +       const struct exception_table_entry *fixup;
44657 +       int write;
44658 +       unsigned long flags;
44659 +       siginfo_t info;
44660 +
44661 +       if (!user_mode(regs))
44662 +               error_code &= ~PF_USER; /* means kernel */
44663 +
44664 +       tsk = current;
44665 +       mm = tsk->mm;
44666 +       prefetchw(&mm->mmap_sem);
44667 +
44668 +       /* get the address */
44669 +       address = HYPERVISOR_shared_info->vcpu_info[
44670 +               smp_processor_id()].arch.cr2;
44671 +
44672 +       info.si_code = SEGV_MAPERR;
44673 +
44674 +
44675 +       /*
44676 +        * We fault-in kernel-space virtual memory on-demand. The
44677 +        * 'reference' page table is init_mm.pgd.
44678 +        *
44679 +        * NOTE! We MUST NOT take any locks for this case. We may
44680 +        * be in an interrupt or a critical region, and should
44681 +        * only copy the information from the master page table,
44682 +        * nothing more.
44683 +        *
44684 +        * This verifies that the fault happens in kernel space
44685 +        * (error_code & 4) == 0, and that the fault was not a
44686 +        * protection error (error_code & 9) == 0.
44687 +        */
44688 +       if (unlikely(address >= TASK_SIZE64)) {
44689 +               /*
44690 +                * Don't check for the module range here: its PML4
44691 +                * is always initialized because it's shared with the main
44692 +                * kernel text. Only vmalloc may need PML4 syncups.
44693 +                */
44694 +               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
44695 +                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
44696 +                       if (vmalloc_fault(address) >= 0)
44697 +                               return;
44698 +               }
44699 +               if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
44700 +                                               SIGSEGV) == NOTIFY_STOP)
44701 +                       return;
44702 +               /* Can take a spurious fault if mapping changes R/O -> R/W. */
44703 +               if (spurious_fault(regs, address, error_code))
44704 +                       return;
44705 +               /*
44706 +                * Don't take the mm semaphore here. If we fixup a prefetch
44707 +                * fault we could otherwise deadlock.
44708 +                */
44709 +               goto bad_area_nosemaphore;
44710 +       }
44711 +
44712 +       if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
44713 +                                       SIGSEGV) == NOTIFY_STOP)
44714 +               return;
44715 +
44716 +       if (likely(regs->eflags & X86_EFLAGS_IF))
44717 +               local_irq_enable();
44718 +
44719 +       if (unlikely(page_fault_trace))
44720 +               printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
44721 +                      regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
44722 +
44723 +       if (unlikely(error_code & PF_RSVD))
44724 +               pgtable_bad(address, regs, error_code);
44725 +
44726 +       /*
44727 +        * If we're in an interrupt or have no user
44728 +        * context, we must not take the fault..
44729 +        */
44730 +       if (unlikely(in_atomic() || !mm))
44731 +               goto bad_area_nosemaphore;
44732 +
44733 + again:
44734 +       /* When running in the kernel we expect faults to occur only to
44735 +        * addresses in user space.  All other faults represent errors in the
44736 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
44737 +        * erroneous fault occurring in a code path which already holds mmap_sem
44738 +        * we will deadlock attempting to validate the fault against the
44739 +        * address space.  Luckily the kernel only validly references user
44740 +        * space from well defined areas of code, which are listed in the
44741 +        * exceptions table.
44742 +        *
44743 +        * As the vast majority of faults will be valid we will only perform
44744 +        * the source reference check when there is a possibility of a deadlock.
44745 +        * Attempt to lock the address space, if we cannot we then validate the
44746 +        * source.  If this is invalid we can skip the address space check,
44747 +        * thus avoiding the deadlock.
44748 +        */
44749 +       if (!down_read_trylock(&mm->mmap_sem)) {
44750 +               if ((error_code & PF_USER) == 0 &&
44751 +                   !search_exception_tables(regs->rip))
44752 +                       goto bad_area_nosemaphore;
44753 +               down_read(&mm->mmap_sem);
44754 +       }
44755 +
44756 +       vma = find_vma(mm, address);
44757 +       if (!vma)
44758 +               goto bad_area;
44759 +       if (likely(vma->vm_start <= address))
44760 +               goto good_area;
44761 +       if (!(vma->vm_flags & VM_GROWSDOWN))
44762 +               goto bad_area;
44763 +       if (error_code & 4) {
44764 +               // XXX: align red zone size with ABI 
44765 +               if (address + 128 < regs->rsp)
44766 +                       goto bad_area;
44767 +       }
44768 +       if (expand_stack(vma, address))
44769 +               goto bad_area;
44770 +/*
44771 + * Ok, we have a good vm_area for this memory access, so
44772 + * we can handle it..
44773 + */
44774 +good_area:
44775 +       info.si_code = SEGV_ACCERR;
44776 +       write = 0;
44777 +       switch (error_code & (PF_PROT|PF_WRITE)) {
44778 +               default:        /* 3: write, present */
44779 +                       /* fall through */
44780 +               case PF_WRITE:          /* write, not present */
44781 +                       if (!(vma->vm_flags & VM_WRITE))
44782 +                               goto bad_area;
44783 +                       write++;
44784 +                       break;
44785 +               case PF_PROT:           /* read, present */
44786 +                       goto bad_area;
44787 +               case 0:                 /* read, not present */
44788 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
44789 +                               goto bad_area;
44790 +       }
44791 +
44792 +       /*
44793 +        * If for any reason at all we couldn't handle the fault,
44794 +        * make sure we exit gracefully rather than endlessly redo
44795 +        * the fault.
44796 +        */
44797 +       switch (handle_mm_fault(mm, vma, address, write)) {
44798 +       case VM_FAULT_MINOR:
44799 +               tsk->min_flt++;
44800 +               break;
44801 +       case VM_FAULT_MAJOR:
44802 +               tsk->maj_flt++;
44803 +               break;
44804 +       case VM_FAULT_SIGBUS:
44805 +               goto do_sigbus;
44806 +       default:
44807 +               goto out_of_memory;
44808 +       }
44809 +
44810 +       up_read(&mm->mmap_sem);
44811 +       return;
44812 +
44813 +/*
44814 + * Something tried to access memory that isn't in our memory map..
44815 + * Fix it, but check if it's kernel or user first..
44816 + */
44817 +bad_area:
44818 +       up_read(&mm->mmap_sem);
44819 +
44820 +bad_area_nosemaphore:
44821 +       /* User mode accesses just cause a SIGSEGV */
44822 +       if (error_code & PF_USER) {
44823 +               if (is_prefetch(regs, address, error_code))
44824 +                       return;
44825 +
44826 +               /* Work around K8 erratum #100: K8 in compat mode
44827 +                  occasionally jumps to illegal addresses >4GB.  We
44828 +                  catch this here in the page fault handler because
44829 +                  these addresses are not reachable. Just detect this
44830 +                  case and return.  Any code segment in LDT is
44831 +                  compatibility mode. */
44832 +               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
44833 +                   (address >> 32))
44834 +                       return;
44835 +
44836 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
44837 +                       printk(
44838 +                      "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
44839 +                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
44840 +                                       tsk->comm, tsk->pid, address, regs->rip,
44841 +                                       regs->rsp, error_code);
44842 +               }
44843 +       
44844 +               tsk->thread.cr2 = address;
44845 +               /* Kernel addresses are always protection faults */
44846 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
44847 +               tsk->thread.trap_no = 14;
44848 +               info.si_signo = SIGSEGV;
44849 +               info.si_errno = 0;
44850 +               /* info.si_code has been set above */
44851 +               info.si_addr = (void __user *)address;
44852 +               force_sig_info(SIGSEGV, &info, tsk);
44853 +               return;
44854 +       }
44855 +
44856 +no_context:
44857 +       
44858 +       /* Are we prepared to handle this kernel fault?  */
44859 +       fixup = search_exception_tables(regs->rip);
44860 +       if (fixup) {
44861 +               regs->rip = fixup->fixup;
44862 +               return;
44863 +       }
44864 +
44865 +       /* 
44866 +        * Hall of shame of CPU/BIOS bugs.
44867 +        */
44868 +
44869 +       if (is_prefetch(regs, address, error_code))
44870 +               return;
44871 +
44872 +       if (is_errata93(regs, address))
44873 +               return; 
44874 +
44875 +/*
44876 + * Oops. The kernel tried to access some bad page. We'll have to
44877 + * terminate things with extreme prejudice.
44878 + */
44879 +
44880 +       flags = oops_begin();
44881 +
44882 +       if (address < PAGE_SIZE)
44883 +               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
44884 +       else
44885 +               printk(KERN_ALERT "Unable to handle kernel paging request");
44886 +       printk(" at %016lx RIP: \n" KERN_ALERT,address);
44887 +       printk_address(regs->rip);
44888 +       dump_pagetable(address);
44889 +       tsk->thread.cr2 = address;
44890 +       tsk->thread.trap_no = 14;
44891 +       tsk->thread.error_code = error_code;
44892 +       __die("Oops", regs, error_code);
44893 +       /* Executive summary in case the body of the oops scrolled away */
44894 +       printk(KERN_EMERG "CR2: %016lx\n", address);
44895 +       oops_end(flags);
44896 +       do_exit(SIGKILL);
44897 +
44898 +/*
44899 + * We ran out of memory, or some other thing happened to us that made
44900 + * us unable to handle the page fault gracefully.
44901 + */
44902 +out_of_memory:
44903 +       up_read(&mm->mmap_sem);
44904 +       if (is_init(current)) {
44905 +               yield();
44906 +               goto again;
44907 +       }
44908 +       printk("VM: killing process %s\n", tsk->comm);
44909 +       if (error_code & 4)
44910 +               do_exit(SIGKILL);
44911 +       goto no_context;
44912 +
44913 +do_sigbus:
44914 +       up_read(&mm->mmap_sem);
44915 +
44916 +       /* Kernel mode? Handle exceptions or die */
44917 +       if (!(error_code & PF_USER))
44918 +               goto no_context;
44919 +
44920 +       tsk->thread.cr2 = address;
44921 +       tsk->thread.error_code = error_code;
44922 +       tsk->thread.trap_no = 14;
44923 +       info.si_signo = SIGBUS;
44924 +       info.si_errno = 0;
44925 +       info.si_code = BUS_ADRERR;
44926 +       info.si_addr = (void __user *)address;
44927 +       force_sig_info(SIGBUS, &info, tsk);
44928 +       return;
44929 +}
44930 +
44931 +DEFINE_SPINLOCK(pgd_lock);
44932 +struct page *pgd_list;
44933 +
44934 +void vmalloc_sync_all(void)
44935 +{
44936 +       /* Note that races in the updates of insync and start aren't 
44937 +          problematic:
44938 +          insync can only get set bits added, and updates to start are only
44939 +          improving performance (without affecting correctness if undone). */
44940 +       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
44941 +       static unsigned long start = VMALLOC_START & PGDIR_MASK;
44942 +       unsigned long address;
44943 +
44944 +       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
44945 +               if (!test_bit(pgd_index(address), insync)) {
44946 +                       const pgd_t *pgd_ref = pgd_offset_k(address);
44947 +                       struct page *page;
44948 +
44949 +                       if (pgd_none(*pgd_ref))
44950 +                               continue;
44951 +                       spin_lock(&pgd_lock);
44952 +                       for (page = pgd_list; page;
44953 +                            page = (struct page *)page->index) {
44954 +                               pgd_t *pgd;
44955 +                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
44956 +                               if (pgd_none(*pgd))
44957 +                                       set_pgd(pgd, *pgd_ref);
44958 +                               else
44959 +                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
44960 +                       }
44961 +                       spin_unlock(&pgd_lock);
44962 +                       set_bit(pgd_index(address), insync);
44963 +               }
44964 +               if (address == start)
44965 +                       start = address + PGDIR_SIZE;
44966 +       }
44967 +       /* Check that there is no need to do the same for the modules area. */
44968 +       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
44969 +       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
44970 +                               (__START_KERNEL & PGDIR_MASK)));
44971 +}
44972 +
44973 +static int __init enable_pagefaulttrace(char *str)
44974 +{
44975 +       page_fault_trace = 1;
44976 +       return 1;
44977 +}
44978 +__setup("pagefaulttrace", enable_pagefaulttrace);
44979 diff -ruNp linux-2.6.19/arch/x86_64/mm/init-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/mm/init-xen.c
44980 --- linux-2.6.19/arch/x86_64/mm/init-xen.c      1970-01-01 00:00:00.000000000 +0000
44981 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/mm/init-xen.c    2007-02-02 19:10:27.000000000 +0000
44982 @@ -0,0 +1,1182 @@
44983 +/*
44984 + *  linux/arch/x86_64/mm/init.c
44985 + *
44986 + *  Copyright (C) 1995  Linus Torvalds
44987 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
44988 + *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
44989 + *
44990 + *  Jun Nakajima <jun.nakajima@intel.com>
44991 + *     Modified for Xen.
44992 + */
44993 +
44994 +#include <linux/signal.h>
44995 +#include <linux/sched.h>
44996 +#include <linux/kernel.h>
44997 +#include <linux/errno.h>
44998 +#include <linux/string.h>
44999 +#include <linux/types.h>
45000 +#include <linux/ptrace.h>
45001 +#include <linux/mman.h>
45002 +#include <linux/mm.h>
45003 +#include <linux/swap.h>
45004 +#include <linux/smp.h>
45005 +#include <linux/init.h>
45006 +#include <linux/pagemap.h>
45007 +#include <linux/bootmem.h>
45008 +#include <linux/proc_fs.h>
45009 +#include <linux/pci.h>
45010 +#include <linux/poison.h>
45011 +#include <linux/dma-mapping.h>
45012 +#include <linux/module.h>
45013 +#include <linux/memory_hotplug.h>
45014 +
45015 +#include <asm/processor.h>
45016 +#include <asm/system.h>
45017 +#include <asm/uaccess.h>
45018 +#include <asm/pgtable.h>
45019 +#include <asm/pgalloc.h>
45020 +#include <asm/dma.h>
45021 +#include <asm/fixmap.h>
45022 +#include <asm/e820.h>
45023 +#include <asm/apic.h>
45024 +#include <asm/tlb.h>
45025 +#include <asm/mmu_context.h>
45026 +#include <asm/proto.h>
45027 +#include <asm/smp.h>
45028 +#include <asm/sections.h>
45029 +
45030 +#include <xen/features.h>
45031 +
45032 +#ifndef Dprintk
45033 +#define Dprintk(x...)
45034 +#endif
45035 +
45036 +struct dma_mapping_ops* dma_ops;
45037 +EXPORT_SYMBOL(dma_ops);
45038 +
45039 +#ifdef CONFIG_XEN_COMPAT_030002
45040 +unsigned int __kernel_page_user;
45041 +EXPORT_SYMBOL(__kernel_page_user);
45042 +#endif
45043 +
45044 +extern unsigned long *contiguous_bitmap;
45045 +
45046 +static unsigned long dma_reserve __initdata;
45047 +
45048 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
45049 +extern unsigned long start_pfn;
45050 +
45051 +/*
45052 + * Use this until direct mapping is established, i.e. before __va() is 
45053 + * available in init_memory_mapping().
45054 + */
45055 +
45056 +#define addr_to_page(addr, page)                               \
45057 +       (addr) &= PHYSICAL_PAGE_MASK;                           \
45058 +       (page) = ((unsigned long *) ((unsigned long)            \
45059 +       (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
45060 +       __START_KERNEL_map)))
45061 +
45062 +static void early_make_page_readonly(void *va, unsigned int feature)
45063 +{
45064 +       unsigned long addr, _va = (unsigned long)va;
45065 +       pte_t pte, *ptep;
45066 +       unsigned long *page = (unsigned long *) init_level4_pgt;
45067 +
45068 +       if (xen_feature(feature))
45069 +               return;
45070 +
45071 +       addr = (unsigned long) page[pgd_index(_va)];
45072 +       addr_to_page(addr, page);
45073 +
45074 +       addr = page[pud_index(_va)];
45075 +       addr_to_page(addr, page);
45076 +
45077 +       addr = page[pmd_index(_va)];
45078 +       addr_to_page(addr, page);
45079 +
45080 +       ptep = (pte_t *) &page[pte_index(_va)];
45081 +
45082 +       pte.pte = ptep->pte & ~_PAGE_RW;
45083 +       if (HYPERVISOR_update_va_mapping(_va, pte, 0))
45084 +               BUG();
45085 +}
45086 +
45087 +void make_page_readonly(void *va, unsigned int feature)
45088 +{
45089 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
45090 +       unsigned long addr = (unsigned long) va;
45091 +
45092 +       if (xen_feature(feature))
45093 +               return;
45094 +
45095 +       pgd = pgd_offset_k(addr);
45096 +       pud = pud_offset(pgd, addr);
45097 +       pmd = pmd_offset(pud, addr);
45098 +       ptep = pte_offset_kernel(pmd, addr);
45099 +
45100 +       pte.pte = ptep->pte & ~_PAGE_RW;
45101 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
45102 +               xen_l1_entry_update(ptep, pte); /* fallback */
45103 +
45104 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
45105 +               make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
45106 +}
45107 +
45108 +void make_page_writable(void *va, unsigned int feature)
45109 +{
45110 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
45111 +       unsigned long addr = (unsigned long) va;
45112 +
45113 +       if (xen_feature(feature))
45114 +               return;
45115 +
45116 +       pgd = pgd_offset_k(addr);
45117 +       pud = pud_offset(pgd, addr);
45118 +       pmd = pmd_offset(pud, addr);
45119 +       ptep = pte_offset_kernel(pmd, addr);
45120 +
45121 +       pte.pte = ptep->pte | _PAGE_RW;
45122 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
45123 +               xen_l1_entry_update(ptep, pte); /* fallback */
45124 +
45125 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
45126 +               make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
45127 +}
45128 +
45129 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
45130 +{
45131 +       if (xen_feature(feature))
45132 +               return;
45133 +
45134 +       while (nr-- != 0) {
45135 +               make_page_readonly(va, feature);
45136 +               va = (void*)((unsigned long)va + PAGE_SIZE);
45137 +       }
45138 +}
45139 +
45140 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
45141 +{
45142 +       if (xen_feature(feature))
45143 +               return;
45144 +
45145 +       while (nr-- != 0) {
45146 +               make_page_writable(va, feature);
45147 +               va = (void*)((unsigned long)va + PAGE_SIZE);
45148 +       }
45149 +}
45150 +
45151 +/*
45152 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
45153 + * physical space so we can cache the location of the first one and move
45154 + * around without checking the pgd every time.
45155 + */
45156 +
45157 +void show_mem(void)
45158 +{
45159 +       long i, total = 0, reserved = 0;
45160 +       long shared = 0, cached = 0;
45161 +       pg_data_t *pgdat;
45162 +       struct page *page;
45163 +
45164 +       printk(KERN_INFO "Mem-info:\n");
45165 +       show_free_areas();
45166 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
45167 +
45168 +       for_each_online_pgdat(pgdat) {
45169 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
45170 +                       page = pfn_to_page(pgdat->node_start_pfn + i);
45171 +                       total++;
45172 +                       if (PageReserved(page))
45173 +                               reserved++;
45174 +                       else if (PageSwapCache(page))
45175 +                               cached++;
45176 +                       else if (page_count(page))
45177 +                               shared += page_count(page) - 1;
45178 +               }
45179 +       }
45180 +       printk(KERN_INFO "%lu pages of RAM\n", total);
45181 +       printk(KERN_INFO "%lu reserved pages\n",reserved);
45182 +       printk(KERN_INFO "%lu pages shared\n",shared);
45183 +       printk(KERN_INFO "%lu pages swap cached\n",cached);
45184 +}
45185 +
45186 +int after_bootmem;
45187 +
45188 +static __init void *spp_getpage(void)
45189 +{ 
45190 +       void *ptr;
45191 +       if (after_bootmem)
45192 +               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
45193 +       else
45194 +               ptr = alloc_bootmem_pages(PAGE_SIZE);
45195 +       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
45196 +               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
45197 +
45198 +       Dprintk("spp_getpage %p\n", ptr);
45199 +       return ptr;
45200 +} 
45201 +
45202 +#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
45203 +
45204 +static inline pud_t *pud_offset_u(unsigned long address)
45205 +{
45206 +       pud_t *pud = level3_user_pgt;
45207 +
45208 +       return pud + pud_index(address);
45209 +}
45210 +
45211 +static __init void set_pte_phys(unsigned long vaddr,
45212 +                        unsigned long phys, pgprot_t prot, int user_mode)
45213 +{
45214 +       pgd_t *pgd;
45215 +       pud_t *pud;
45216 +       pmd_t *pmd;
45217 +       pte_t *pte, new_pte;
45218 +
45219 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
45220 +
45221 +       pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
45222 +       if (pgd_none(*pgd)) {
45223 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
45224 +               return;
45225 +       }
45226 +       pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
45227 +       if (pud_none(*pud)) {
45228 +               pmd = (pmd_t *) spp_getpage(); 
45229 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
45230 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
45231 +               if (pmd != pmd_offset(pud, 0)) {
45232 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
45233 +                       return;
45234 +               }
45235 +       }
45236 +       pmd = pmd_offset(pud, vaddr);
45237 +       if (pmd_none(*pmd)) {
45238 +               pte = (pte_t *) spp_getpage();
45239 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
45240 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
45241 +               if (pte != pte_offset_kernel(pmd, 0)) {
45242 +                       printk("PAGETABLE BUG #02!\n");
45243 +                       return;
45244 +               }
45245 +       }
45246 +       if (pgprot_val(prot))
45247 +               new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
45248 +       else
45249 +               new_pte = __pte(0);
45250 +
45251 +       pte = pte_offset_kernel(pmd, vaddr);
45252 +       if (!pte_none(*pte) &&
45253 +           pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
45254 +               pte_ERROR(*pte);
45255 +       set_pte(pte, new_pte);
45256 +
45257 +       /*
45258 +        * It's enough to flush this one mapping.
45259 +        * (PGE mappings get flushed as well)
45260 +        */
45261 +       __flush_tlb_one(vaddr);
45262 +}
45263 +
45264 +static void set_pte_phys_ma(unsigned long vaddr,
45265 +                        unsigned long phys, pgprot_t prot)
45266 +{
45267 +       pgd_t *pgd;
45268 +       pud_t *pud;
45269 +       pmd_t *pmd;
45270 +       pte_t *pte, new_pte;
45271 +
45272 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
45273 +
45274 +       pgd = pgd_offset_k(vaddr);
45275 +       if (pgd_none(*pgd)) {
45276 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
45277 +               return;
45278 +       }
45279 +       pud = pud_offset(pgd, vaddr);
45280 +       if (pud_none(*pud)) {
45281 +
45282 +               pmd = (pmd_t *) spp_getpage(); 
45283 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
45284 +
45285 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
45286 +
45287 +               if (pmd != pmd_offset(pud, 0)) {
45288 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
45289 +                       return;
45290 +               }
45291 +       }
45292 +       pmd = pmd_offset(pud, vaddr);
45293 +
45294 +       if (pmd_none(*pmd)) {
45295 +               pte = (pte_t *) spp_getpage();
45296 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
45297 +
45298 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
45299 +               if (pte != pte_offset_kernel(pmd, 0)) {
45300 +                       printk("PAGETABLE BUG #02!\n");
45301 +                       return;
45302 +               }
45303 +       }
45304 +
45305 +       new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
45306 +       pte = pte_offset_kernel(pmd, vaddr);
45307 +
45308 +       /* 
45309 +        * Note that the pte page is already RO, thus we want to use
45310 +        * xen_l1_entry_update(), not set_pte().
45311 +        */
45312 +       xen_l1_entry_update(pte, 
45313 +                           pfn_pte_ma(phys >> PAGE_SHIFT, prot));
45314 +
45315 +       /*
45316 +        * It's enough to flush this one mapping.
45317 +        * (PGE mappings get flushed as well)
45318 +        */
45319 +       __flush_tlb_one(vaddr);
45320 +}
45321 +
45322 +#define SET_FIXMAP_KERNEL 0
45323 +#define SET_FIXMAP_USER   1
45324 +
45325 +/* NOTE: this is meant to be run only at boot */
45326 +void __init 
45327 +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
45328 +{
45329 +       unsigned long address = __fix_to_virt(idx);
45330 +
45331 +       if (idx >= __end_of_fixed_addresses) {
45332 +               printk("Invalid __set_fixmap\n");
45333 +               return;
45334 +       }
45335 +       switch (idx) {
45336 +       case VSYSCALL_FIRST_PAGE:
45337 +               set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
45338 +               break;
45339 +       default:
45340 +               set_pte_phys_ma(address, phys, prot);
45341 +               break;
45342 +       }
45343 +}
45344 +
45345 +/*
45346 + * At this point it only supports vsyscall area.
45347 + */
45348 +void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
45349 +{
45350 +       unsigned long address = __fix_to_virt(idx);
45351 +
45352 +       if (idx >= __end_of_fixed_addresses) {
45353 +               printk("Invalid __set_fixmap\n");
45354 +               return;
45355 +       }
45356 +
45357 +       set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
45358 +}
45359 +
45360 +unsigned long __initdata table_start, table_end; 
45361 +
45362 +#ifndef CONFIG_XEN
45363 +extern pmd_t temp_boot_pmds[]; 
45364 +
45365 +static  struct temp_map { 
45366 +       pmd_t *pmd;
45367 +       void  *address; 
45368 +       int    allocated; 
45369 +} temp_mappings[] __initdata = { 
45370 +       { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
45371 +       { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
45372 +       {}
45373 +}; 
45374 +#endif /* !CONFIG_XEN */
45375 +
45376 +unsigned long get_machine_pfn(unsigned long addr)
45377 +{
45378 +       pud_t* pud = pud_offset_k(NULL, addr);
45379 +       pmd_t* pmd = pmd_offset(pud, addr);
45380 +       pte_t *pte = pte_offset_kernel(pmd, addr);
45381 +
45382 +       return pte_mfn(*pte);
45383 +} 
45384 +
45385 +static __meminit void *alloc_static_page(unsigned long *phys)
45386 +{
45387 +       unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
45388 +
45389 +       if (after_bootmem) {
45390 +               void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
45391 +               *phys = __pa(adr);
45392 +               return adr;
45393 +       }
45394 +
45395 +       *phys = start_pfn << PAGE_SHIFT;
45396 +       start_pfn++;
45397 +       memset((void *)va, 0, PAGE_SIZE);
45398 +       return (void *)va;
45399 +} 
45400 +
45401 +#define PTE_SIZE PAGE_SIZE
45402 +
45403 +static inline void __set_pte(pte_t *dst, pte_t val)
45404 +{
45405 +       *dst = val;
45406 +}
45407 +
45408 +static inline int make_readonly(unsigned long paddr)
45409 +{
45410 +       int readonly = 0;
45411 +
45412 +       /* Make new page tables read-only. */
45413 +       if (!xen_feature(XENFEAT_writable_page_tables)
45414 +           && (paddr >= (table_start << PAGE_SHIFT))
45415 +           && (paddr < (table_end << PAGE_SHIFT)))
45416 +               readonly = 1;
45417 +       /* Make old page tables read-only. */
45418 +       if (!xen_feature(XENFEAT_writable_page_tables)
45419 +           && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
45420 +           && (paddr < (start_pfn << PAGE_SHIFT)))
45421 +               readonly = 1;
45422 +
45423 +       /*
45424 +        * No need for writable mapping of kernel image. This also ensures that
45425 +        * page and descriptor tables embedded inside don't have writable
45426 +        * mappings. 
45427 +        */
45428 +       if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
45429 +               readonly = 1;
45430 +
45431 +       return readonly;
45432 +}
45433 +
45434 +#ifndef CONFIG_XEN
45435 +/* Must run before zap_low_mappings */
45436 +__init void *early_ioremap(unsigned long addr, unsigned long size)
45437 +{
45438 +       unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 
45439 +
45440 +       /* actually usually some more */
45441 +       if (size >= LARGE_PAGE_SIZE) { 
45442 +               return NULL;
45443 +       }
45444 +       set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
45445 +       map += LARGE_PAGE_SIZE;
45446 +       set_pmd(temp_mappings[1].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
45447 +       __flush_tlb();
45448 +       return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
45449 +}
45450 +
45451 +/* To avoid virtual aliases later */
45452 +__init void early_iounmap(void *addr, unsigned long size)
45453 +{
45454 +       if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
45455 +               printk("early_iounmap: bad address %p\n", addr);
45456 +       set_pmd(temp_mappings[0].pmd, __pmd(0));
45457 +       set_pmd(temp_mappings[1].pmd, __pmd(0));
45458 +       __flush_tlb();
45459 +}
45460 +#endif /* !CONFIG_XEN */
45461 +
45462 +static void __meminit
45463 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
45464 +{
45465 +       int i = pmd_index(address);
45466 +       int k;
45467 +
45468 +       for (; i < PTRS_PER_PMD; i++) {
45469 +               unsigned long pte_phys;
45470 +               pte_t *pte, *pte_save;
45471 +               pmd_t *pmd = pmd_page + pmd_index(address);
45472 +
45473 +               if (address >= end) {
45474 +                       if (!after_bootmem)
45475 +                               for (; i < PTRS_PER_PMD; i++, pmd++)
45476 +                                       set_pmd(pmd, __pmd(0));
45477 +                       break;
45478 +               }
45479 +
45480 +               if (pmd_val(*pmd))
45481 +                       continue;
45482 +
45483 +               pte = alloc_static_page(&pte_phys);
45484 +               pte_save = pte;
45485 +               for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
45486 +                       if ((address >= end) ||
45487 +                           ((address >> PAGE_SHIFT) >=
45488 +                            xen_start_info->nr_pages)) { 
45489 +                               __set_pte(pte, __pte(0)); 
45490 +                               continue;
45491 +                       }
45492 +                       if (make_readonly(address)) {
45493 +                               __set_pte(pte, 
45494 +                                         __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
45495 +                               continue;
45496 +                       }
45497 +                       __set_pte(pte, __pte(address | _KERNPG_TABLE));
45498 +               }
45499 +               pte = pte_save;
45500 +               early_make_page_readonly(pte, XENFEAT_writable_page_tables);
45501 +               set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
45502 +       }
45503 +}
45504 +
45505 +static void __meminit
45506 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
45507 +{
45508 +       pmd_t *pmd = pmd_offset(pud,0);
45509 +       spin_lock(&init_mm.page_table_lock);
45510 +       phys_pmd_init(pmd, address, end);
45511 +       spin_unlock(&init_mm.page_table_lock);
45512 +       __flush_tlb_all();
45513 +}
45514 +
45515 +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
45516 +{ 
45517 +       int i = pud_index(addr);
45518 +
45519 +
45520 +       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
45521 +               unsigned long pmd_phys;
45522 +               pud_t *pud = pud_page + pud_index(addr);
45523 +               pmd_t *pmd;
45524 +
45525 +               if (addr >= end)
45526 +                       break;
45527 +
45528 +               if (pud_val(*pud)) {
45529 +                       phys_pmd_update(pud, addr, end);
45530 +                       continue;
45531 +               }
45532 +
45533 +               pmd = alloc_static_page(&pmd_phys);
45534 +               early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
45535 +               spin_lock(&init_mm.page_table_lock);
45536 +               set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
45537 +               phys_pmd_init(pmd, addr, end);
45538 +               spin_unlock(&init_mm.page_table_lock);
45539 +       }
45540 +       __flush_tlb();
45541 +} 
45542 +
45543 +void __init xen_init_pt(void)
45544 +{
45545 +       unsigned long addr, *page;
45546 +
45547 +       memset((void *)init_level4_pgt,   0, PAGE_SIZE);
45548 +       memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
45549 +       memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
45550 +
45551 +       /* Find the initial pte page that was built for us. */
45552 +       page = (unsigned long *)xen_start_info->pt_base;
45553 +       addr = page[pgd_index(__START_KERNEL_map)];
45554 +       addr_to_page(addr, page);
45555 +       addr = page[pud_index(__START_KERNEL_map)];
45556 +       addr_to_page(addr, page);
45557 +
45558 +#ifdef CONFIG_XEN_COMPAT_030002
45559 +       /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
45560 +          in kernel PTEs. We check that here. */
45561 +       if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
45562 +               unsigned long *pg;
45563 +               pte_t pte;
45564 +
45565 +               /* Mess with the initial mapping of page 0. It's not needed. */
45566 +               BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
45567 +               addr = page[pmd_index(__START_KERNEL_map)];
45568 +               addr_to_page(addr, pg);
45569 +               pte.pte = pg[pte_index(__START_KERNEL_map)];
45570 +               BUG_ON(!(pte.pte & _PAGE_PRESENT));
45571 +
45572 +               /* If _PAGE_USER isn't set, we obviously do not need it. */
45573 +               if (pte.pte & _PAGE_USER) {
45574 +                       /* _PAGE_USER is needed, but is it set implicitly? */
45575 +                       pte.pte &= ~_PAGE_USER;
45576 +                       if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
45577 +                                                         pte, 0) != 0) ||
45578 +                           !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
45579 +                               /* We need to explicitly specify _PAGE_USER. */
45580 +                               __kernel_page_user = _PAGE_USER;
45581 +               }
45582 +       }
45583 +#endif
45584 +
45585 +       /* Construct mapping of initial pte page in our own directories. */
45586 +       init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
45587 +               mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
45588 +       level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
45589 +               __pud(__pa_symbol(level2_kernel_pgt) |
45590 +                     _KERNPG_TABLE);
45591 +       memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
45592 +
45593 +       early_make_page_readonly(init_level4_pgt,
45594 +                                XENFEAT_writable_page_tables);
45595 +       early_make_page_readonly(init_level4_user_pgt,
45596 +                                XENFEAT_writable_page_tables);
45597 +       early_make_page_readonly(level3_kernel_pgt,
45598 +                                XENFEAT_writable_page_tables);
45599 +       early_make_page_readonly(level3_user_pgt,
45600 +                                XENFEAT_writable_page_tables);
45601 +       early_make_page_readonly(level2_kernel_pgt,
45602 +                                XENFEAT_writable_page_tables);
45603 +
45604 +       xen_pgd_pin(__pa_symbol(init_level4_pgt));
45605 +       xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
45606 +
45607 +       set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
45608 +               mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
45609 +}
45610 +
45611 +void __init extend_init_mapping(unsigned long tables_space)
45612 +{
45613 +       unsigned long va = __START_KERNEL_map;
45614 +       unsigned long phys, addr, *pte_page;
45615 +       pmd_t *pmd;
45616 +       pte_t *pte, new_pte;
45617 +       unsigned long *page = (unsigned long *)init_level4_pgt;
45618 +
45619 +       addr = page[pgd_index(va)];
45620 +       addr_to_page(addr, page);
45621 +       addr = page[pud_index(va)];
45622 +       addr_to_page(addr, page);
45623 +
45624 +       /* Kill mapping of low 1MB. */
45625 +       while (va < (unsigned long)&_text) {
45626 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
45627 +               va += PAGE_SIZE;
45628 +       }
45629 +
45630 +       /* Ensure init mappings cover kernel text/data and initial tables. */
45631 +       while (va < (__START_KERNEL_map
45632 +                    + (start_pfn << PAGE_SHIFT)
45633 +                    + tables_space)) {
45634 +               pmd = (pmd_t *)&page[pmd_index(va)];
45635 +               if (pmd_none(*pmd)) {
45636 +                       pte_page = alloc_static_page(&phys);
45637 +                       early_make_page_readonly(
45638 +                               pte_page, XENFEAT_writable_page_tables);
45639 +                       set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
45640 +               } else {
45641 +                       addr = page[pmd_index(va)];
45642 +                       addr_to_page(addr, pte_page);
45643 +               }
45644 +               pte = (pte_t *)&pte_page[pte_index(va)];
45645 +               if (pte_none(*pte)) {
45646 +                       new_pte = pfn_pte(
45647 +                               (va - __START_KERNEL_map) >> PAGE_SHIFT, 
45648 +                               __pgprot(_KERNPG_TABLE));
45649 +                       xen_l1_entry_update(pte, new_pte);
45650 +               }
45651 +               va += PAGE_SIZE;
45652 +       }
45653 +
45654 +       /* Finally, blow away any spurious initial mappings. */
45655 +       while (1) {
45656 +               pmd = (pmd_t *)&page[pmd_index(va)];
45657 +               if (pmd_none(*pmd))
45658 +                       break;
45659 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
45660 +               va += PAGE_SIZE;
45661 +       }
45662 +}
45663 +
45664 +static void __init find_early_table_space(unsigned long end)
45665 +{
45666 +       unsigned long puds, pmds, ptes, tables; 
45667 +
45668 +       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
45669 +       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
45670 +       ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
45671 +
45672 +       tables = round_up(puds * 8, PAGE_SIZE) + 
45673 +               round_up(pmds * 8, PAGE_SIZE) + 
45674 +               round_up(ptes * 8, PAGE_SIZE); 
45675 +
45676 +       extend_init_mapping(tables);
45677 +
45678 +       table_start = start_pfn;
45679 +       table_end = table_start + (tables>>PAGE_SHIFT);
45680 +
45681 +       early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
45682 +               end, table_start << PAGE_SHIFT,
45683 +                    (table_end << PAGE_SHIFT) + tables);
45684 +}
45685 +
45686 +/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
45687 +   This runs before bootmem is initialized and gets pages directly from the 
45688 +   physical memory. To access them they are temporarily mapped. */
45689 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
45690 +{ 
45691 +       unsigned long next; 
45692 +
45693 +       Dprintk("init_memory_mapping\n");
45694 +
45695 +       /* 
45696 +        * Find space for the kernel direct mapping tables.
45697 +        * Later we should allocate these tables in the local node of the memory
45698 +        * mapped.  Unfortunately this is done currently before the nodes are 
45699 +        * discovered.
45700 +        */
45701 +       if (!after_bootmem)
45702 +               find_early_table_space(end);
45703 +
45704 +       start = (unsigned long)__va(start);
45705 +       end = (unsigned long)__va(end);
45706 +
45707 +       for (; start < end; start = next) {
45708 +               unsigned long pud_phys; 
45709 +               pgd_t *pgd = pgd_offset_k(start);
45710 +               pud_t *pud;
45711 +
45712 +               if (after_bootmem) {
45713 +                       pud = pud_offset(pgd, start & PGDIR_MASK);
45714 +                       make_page_readonly(pud, XENFEAT_writable_page_tables);
45715 +                       pud_phys = __pa(pud);
45716 +               } else {
45717 +                       pud = alloc_static_page(&pud_phys);
45718 +                       early_make_page_readonly(pud, XENFEAT_writable_page_tables);
45719 +               }
45720 +               next = start + PGDIR_SIZE;
45721 +               if (next > end) 
45722 +                       next = end; 
45723 +               phys_pud_init(pud, __pa(start), __pa(next));
45724 +               if (!after_bootmem)
45725 +                       set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
45726 +       }
45727 +
45728 +       if (!after_bootmem) {
45729 +               BUG_ON(start_pfn != table_end);
45730 +
45731 +               /* Re-vector virtual addresses pointing into the initial
45732 +                  mapping to the just-established permanent ones. */
45733 +               xen_start_info = __va(__pa(xen_start_info));
45734 +               xen_start_info->pt_base = (unsigned long)
45735 +                       __va(__pa(xen_start_info->pt_base));
45736 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
45737 +                       phys_to_machine_mapping =
45738 +                               __va(__pa(xen_start_info->mfn_list));
45739 +                       xen_start_info->mfn_list = (unsigned long)
45740 +                               phys_to_machine_mapping;
45741 +               }
45742 +               if (xen_start_info->mod_start)
45743 +                       xen_start_info->mod_start = (unsigned long)
45744 +                               __va(__pa(xen_start_info->mod_start));
45745 +
45746 +               /* Destroy the Xen-created mappings beyond the kernel image as
45747 +                * well as the temporary mappings created above. Prevents
45748 +                * overlap with modules area (if init mapping is very big).
45749 +                */
45750 +               start = PAGE_ALIGN((unsigned long)_end);
45751 +               end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
45752 +               for (; start < end; start += PAGE_SIZE)
45753 +                       WARN_ON(HYPERVISOR_update_va_mapping(
45754 +                               start, __pte_ma(0), 0));
45755 +       }
45756 +
45757 +       __flush_tlb_all();
45758 +}
45759 +
45760 +void __cpuinit zap_low_mappings(int cpu)
45761 +{
45762 +       /* this is not required for Xen */
45763 +#if 0
45764 +       swap_low_mappings();
45765 +#endif
45766 +}
45767 +
45768 +#ifndef CONFIG_NUMA
45769 +void __init paging_init(void)
45770 +{
45771 +       int i;
45772 +       unsigned long max_zone_pfns[MAX_NR_ZONES];
45773 +       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
45774 +       max_zone_pfns[ZONE_DMA] = end_pfn;
45775 +       max_zone_pfns[ZONE_DMA32] = end_pfn;
45776 +       max_zone_pfns[ZONE_NORMAL] = end_pfn;
45777 +
45778 +       memory_present(0, 0, end_pfn);
45779 +       sparse_init();
45780 +       free_area_init_nodes(max_zone_pfns);
45781 +
45782 +       /* Switch to the real shared_info page, and clear the
45783 +        * dummy page. */
45784 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
45785 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
45786 +       memset(empty_zero_page, 0, sizeof(empty_zero_page));
45787 +
45788 +       init_mm.context.pinned = 1;
45789 +
45790 +       /* Setup mapping of lower 1st MB */
45791 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
45792 +               if (is_initial_xendomain())
45793 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
45794 +               else
45795 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
45796 +                                    virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
45797 +                                    PAGE_KERNEL_RO);
45798 +}
45799 +#endif
45800 +
45801 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
45802 +   from the CPU leading to inconsistent cache lines. address and size
45803 +   must be aligned to 2MB boundaries. 
45804 +   Does nothing when the mapping doesn't exist. */
45805 +void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
45806 +{
45807 +       unsigned long end = address + size;
45808 +
45809 +       BUG_ON(address & ~LARGE_PAGE_MASK);
45810 +       BUG_ON(size & ~LARGE_PAGE_MASK); 
45811 +       
45812 +       for (; address < end; address += LARGE_PAGE_SIZE) { 
45813 +               pgd_t *pgd = pgd_offset_k(address);
45814 +               pud_t *pud;
45815 +               pmd_t *pmd;
45816 +               if (pgd_none(*pgd))
45817 +                       continue;
45818 +               pud = pud_offset(pgd, address);
45819 +               if (pud_none(*pud))
45820 +                       continue; 
45821 +               pmd = pmd_offset(pud, address);
45822 +               if (!pmd || pmd_none(*pmd))
45823 +                       continue; 
45824 +               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
45825 +                       /* Could handle this, but it should not happen currently. */
45826 +                       printk(KERN_ERR 
45827 +              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
45828 +                       pmd_ERROR(*pmd); 
45829 +               }
45830 +               set_pmd(pmd, __pmd(0));                 
45831 +       }
45832 +       __flush_tlb_all();
45833 +} 
45834 +
45835 +/*
45836 + * Memory hotplug specific functions
45837 + */
45838 +void online_page(struct page *page)
45839 +{
45840 +       ClearPageReserved(page);
45841 +       init_page_count(page);
45842 +       __free_page(page);
45843 +       totalram_pages++;
45844 +       num_physpages++;
45845 +}
45846 +
45847 +#ifdef CONFIG_MEMORY_HOTPLUG
45848 +/*
45849 + * Memory is added always to NORMAL zone. This means you will never get
45850 + * additional DMA/DMA32 memory.
45851 + */
45852 +int arch_add_memory(int nid, u64 start, u64 size)
45853 +{
45854 +       struct pglist_data *pgdat = NODE_DATA(nid);
45855 +       struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
45856 +       unsigned long start_pfn = start >> PAGE_SHIFT;
45857 +       unsigned long nr_pages = size >> PAGE_SHIFT;
45858 +       int ret;
45859 +
45860 +       init_memory_mapping(start, (start + size -1));
45861 +
45862 +       ret = __add_pages(zone, start_pfn, nr_pages);
45863 +       if (ret)
45864 +               goto error;
45865 +
45866 +       return ret;
45867 +error:
45868 +       printk("%s: Problem encountered in __add_pages!\n", __func__);
45869 +       return ret;
45870 +}
45871 +EXPORT_SYMBOL_GPL(arch_add_memory);
45872 +
45873 +int remove_memory(u64 start, u64 size)
45874 +{
45875 +       return -EINVAL;
45876 +}
45877 +EXPORT_SYMBOL_GPL(remove_memory);
45878 +
45879 +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
45880 +int memory_add_physaddr_to_nid(u64 start)
45881 +{
45882 +       return 0;
45883 +}
45884 +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
45885 +#endif
45886 +
45887 +#endif /* CONFIG_MEMORY_HOTPLUG */
45888 +
45889 +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
45890 +/*
45891 + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
45892 + * just online the pages.
45893 + */
45894 +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
45895 +{
45896 +       int err = -EIO;
45897 +       unsigned long pfn;
45898 +       unsigned long total = 0, mem = 0;
45899 +       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
45900 +               if (pfn_valid(pfn)) {
45901 +                       online_page(pfn_to_page(pfn));
45902 +                       err = 0;
45903 +                       mem++;
45904 +               }
45905 +               total++;
45906 +       }
45907 +       if (!err) {
45908 +               z->spanned_pages += total;
45909 +               z->present_pages += mem;
45910 +               z->zone_pgdat->node_spanned_pages += total;
45911 +               z->zone_pgdat->node_present_pages += mem;
45912 +       }
45913 +       return err;
45914 +}
45915 +#endif
45916 +
45917 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
45918 +                        kcore_vsyscall;
45919 +
45920 +void __init mem_init(void)
45921 +{
45922 +       long codesize, reservedpages, datasize, initsize;
45923 +       unsigned long pfn;
45924 +
45925 +       contiguous_bitmap = alloc_bootmem_low_pages(
45926 +               (end_pfn + 2*BITS_PER_LONG) >> 3);
45927 +       BUG_ON(!contiguous_bitmap);
45928 +       memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
45929 +
45930 +       pci_iommu_alloc();
45931 +
45932 +       /* clear the zero-page */
45933 +       memset(empty_zero_page, 0, PAGE_SIZE);
45934 +
45935 +       reservedpages = 0;
45936 +
45937 +       /* this will put all low memory onto the freelists */
45938 +#ifdef CONFIG_NUMA
45939 +       totalram_pages = numa_free_all_bootmem();
45940 +#else
45941 +       totalram_pages = free_all_bootmem();
45942 +#endif
45943 +       /* XEN: init and count pages outside initial allocation. */
45944 +       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
45945 +               ClearPageReserved(pfn_to_page(pfn));
45946 +               init_page_count(pfn_to_page(pfn));
45947 +               totalram_pages++;
45948 +       }
45949 +       reservedpages = end_pfn - totalram_pages -
45950 +                                       absent_pages_in_range(0, end_pfn);
45951 +
45952 +
45953 +       after_bootmem = 1;
45954 +
45955 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
45956 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
45957 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
45958 +
45959 +       /* Register memory areas for /proc/kcore */
45960 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
45961 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
45962 +                  VMALLOC_END-VMALLOC_START);
45963 +       kclist_add(&kcore_kernel, &_stext, _end - _stext);
45964 +       kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
45965 +       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
45966 +                                VSYSCALL_END - VSYSCALL_START);
45967 +
45968 +       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
45969 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
45970 +               end_pfn << (PAGE_SHIFT-10),
45971 +               codesize >> 10,
45972 +               reservedpages << (PAGE_SHIFT-10),
45973 +               datasize >> 10,
45974 +               initsize >> 10);
45975 +
45976 +#ifndef CONFIG_XEN
45977 +#ifdef CONFIG_SMP
45978 +       /*
45979 +        * Sync boot_level4_pgt mappings with the init_level4_pgt
45980 +        * except for the low identity mappings which are already zapped
45981 +        * in init_level4_pgt. This sync-up is essential for AP's bringup
45982 +        */
45983 +       memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
45984 +#endif
45985 +#endif
45986 +}
45987 +
45988 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
45989 +{
45990 +       unsigned long addr;
45991 +
45992 +       if (begin >= end)
45993 +               return;
45994 +
45995 +       printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
45996 +       for (addr = begin; addr < end; addr += PAGE_SIZE) {
45997 +               ClearPageReserved(virt_to_page(addr));
45998 +               init_page_count(virt_to_page(addr));
45999 +               memset((void *)(addr & ~(PAGE_SIZE-1)),
46000 +                       POISON_FREE_INITMEM, PAGE_SIZE);
46001 +               free_page(addr);
46002 +               totalram_pages++;
46003 +       }
46004 +}
46005 +
46006 +void free_initmem(void)
46007 +{
46008 +       memset(__initdata_begin, POISON_FREE_INITDATA,
46009 +              __initdata_end - __initdata_begin);
46010 +#ifdef __DO_LATER__
46011 +       free_init_pages("unused kernel memory",
46012 +                       (unsigned long)(&__init_begin),
46013 +                       (unsigned long)(&__init_end));
46014 +#endif
46015 +}
46016 +
46017 +#ifdef CONFIG_DEBUG_RODATA
46018 +
46019 +void mark_rodata_ro(void)
46020 +{
46021 +       unsigned long addr = (unsigned long)__start_rodata;
46022 +
46023 +       for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
46024 +               change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
46025 +
46026 +       printk ("Write protecting the kernel read-only data: %luk\n",
46027 +                       (__end_rodata - __start_rodata) >> 10);
46028 +
46029 +       /*
46030 +        * change_page_attr_addr() requires a global_flush_tlb() call after it.
46031 +        * We do this after the printk so that if something went wrong in the
46032 +        * change, the printk gets out at least to give a better debug hint
46033 +        * of who is the culprit.
46034 +        */
46035 +       global_flush_tlb();
46036 +}
46037 +#endif
46038 +
46039 +#ifdef CONFIG_BLK_DEV_INITRD
46040 +void free_initrd_mem(unsigned long start, unsigned long end)
46041 +{
46042 +       free_init_pages("initrd memory", start, end);
46043 +}
46044 +#endif
46045 +
46046 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
46047 +{ 
46048 +#ifdef CONFIG_NUMA
46049 +       int nid = phys_to_nid(phys);
46050 +#endif
46051 +       unsigned long pfn = phys >> PAGE_SHIFT;
46052 +       if (pfn >= end_pfn) {
46053 +               /* This can happen with kdump kernels when accessing firmware
46054 +                  tables. */
46055 +               if (pfn < end_pfn_map)
46056 +                       return;
46057 +               printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
46058 +                               phys, len);
46059 +               return;
46060 +       }
46061 +
46062 +       /* Should check here against the e820 map to avoid double free */
46063 +#ifdef CONFIG_NUMA
46064 +       reserve_bootmem_node(NODE_DATA(nid), phys, len);
46065 +#else                  
46066 +       reserve_bootmem(phys, len);    
46067 +#endif
46068 +       if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
46069 +               dma_reserve += len / PAGE_SIZE;
46070 +               set_dma_reserve(dma_reserve);
46071 +       }
46072 +}
46073 +
46074 +int kern_addr_valid(unsigned long addr) 
46075 +{ 
46076 +       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
46077 +       pgd_t *pgd;
46078 +       pud_t *pud;
46079 +       pmd_t *pmd;
46080 +       pte_t *pte;
46081 +
46082 +       if (above != 0 && above != -1UL)
46083 +               return 0; 
46084 +       
46085 +       pgd = pgd_offset_k(addr);
46086 +       if (pgd_none(*pgd))
46087 +               return 0;
46088 +
46089 +       pud = pud_offset_k(pgd, addr);
46090 +       if (pud_none(*pud))
46091 +               return 0; 
46092 +
46093 +       pmd = pmd_offset(pud, addr);
46094 +       if (pmd_none(*pmd))
46095 +               return 0;
46096 +       if (pmd_large(*pmd))
46097 +               return pfn_valid(pmd_pfn(*pmd));
46098 +
46099 +       pte = pte_offset_kernel(pmd, addr);
46100 +       if (pte_none(*pte))
46101 +               return 0;
46102 +       return pfn_valid(pte_pfn(*pte));
46103 +}
46104 +
46105 +#ifdef CONFIG_SYSCTL
46106 +#include <linux/sysctl.h>
46107 +
46108 +extern int exception_trace, page_fault_trace;
46109 +
46110 +static ctl_table debug_table2[] = {
46111 +       { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
46112 +         proc_dointvec },
46113 +       { 0, }
46114 +}; 
46115 +
46116 +static ctl_table debug_root_table2[] = { 
46117 +       { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
46118 +          .child = debug_table2 }, 
46119 +       { 0 }, 
46120 +}; 
46121 +
46122 +static __init int x8664_sysctl_init(void)
46123 +{ 
46124 +       register_sysctl_table(debug_root_table2, 1);
46125 +       return 0;
46126 +}
46127 +__initcall(x8664_sysctl_init);
46128 +#endif
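Editor's note: the CONFIG_SYSCTL block above hooks the exception_trace flag under the debug sysctl directory; with procfs mounted in the usual place that should surface as /proc/sys/debug/exception-trace. A hypothetical userspace toggle (the path is derived from the ctl_name/procname pair above, not confirmed elsewhere in this patch):

#include <stdio.h>

/* Hypothetical helper: flips the sysctl registered by x8664_sysctl_init(). */
static int set_exception_trace(int on)
{
	FILE *f = fopen("/proc/sys/debug/exception-trace", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", on ? 1 : 0);
	return fclose(f);
}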
46129 +
46130 +/* A pseudo VMA to allow ptrace access to the vsyscall page.  This only
46131 +   covers the 64-bit vsyscall page now; 32-bit has a real VMA and does
46132 +   not need special handling anymore. */
46133 +
46134 +static struct vm_area_struct gate_vma = {
46135 +       .vm_start = VSYSCALL_START,
46136 +       .vm_end = VSYSCALL_END,
46137 +       .vm_page_prot = PAGE_READONLY
46138 +};
46139 +
46140 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
46141 +{
46142 +#ifdef CONFIG_IA32_EMULATION
46143 +       if (test_tsk_thread_flag(tsk, TIF_IA32))
46144 +               return NULL;
46145 +#endif
46146 +       return &gate_vma;
46147 +}
46148 +
46149 +int in_gate_area(struct task_struct *task, unsigned long addr)
46150 +{
46151 +       struct vm_area_struct *vma = get_gate_vma(task);
46152 +       if (!vma)
46153 +               return 0;
46154 +       return (addr >= vma->vm_start) && (addr < vma->vm_end);
46155 +}
46156 +
46157 +/* Use this when you have no reliable task/vma, typically from interrupt
46158 + * context.  It is less reliable than using the task's vma and may give
46159 + * false positives.
46160 + */
46161 +int in_gate_area_no_task(unsigned long addr)
46162 +{
46163 +       return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
46164 +}
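Editor's note: kern_addr_valid() above walks all four page-table levels and short-circuits on 2 MB mappings via pmd_large()/pmd_pfn(); the gate-area helpers answer the same question for the fixed vsyscall window. A hedged sketch of the intended use, probing an address before dereferencing it (the helper name is invented for illustration):

/* Illustrative only: validate a kernel virtual address before touching it. */
static int read_kernel_byte(unsigned long addr, unsigned char *out)
{
	if (!kern_addr_valid(addr))
		return -EFAULT;		/* no mapping in the kernel page tables */

	*out = *(unsigned char *)addr;
	return 0;
}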
46165 diff -ruNp linux-2.6.19/arch/x86_64/mm/pageattr-xen.c linux-2.6.19-xen-3.0.4/arch/x86_64/mm/pageattr-xen.c
46166 --- linux-2.6.19/arch/x86_64/mm/pageattr-xen.c  1970-01-01 00:00:00.000000000 +0000
46167 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/mm/pageattr-xen.c        2007-02-02 19:10:27.000000000 +0000
46168 @@ -0,0 +1,396 @@
46169 +/* 
46170 + * Copyright 2002 Andi Kleen, SuSE Labs. 
46171 + * Thanks to Ben LaHaise for precious feedback.
46172 + */ 
46173 +
46174 +#include <linux/mm.h>
46175 +#include <linux/sched.h>
46176 +#include <linux/highmem.h>
46177 +#include <linux/module.h>
46178 +#include <linux/slab.h>
46179 +#include <asm/uaccess.h>
46180 +#include <asm/processor.h>
46181 +#include <asm/tlbflush.h>
46182 +#include <asm/io.h>
46183 +
46184 +#ifdef CONFIG_XEN
46185 +#include <asm/pgalloc.h>
46186 +#include <asm/mmu_context.h>
46187 +
46188 +LIST_HEAD(mm_unpinned);
46189 +DEFINE_SPINLOCK(mm_unpinned_lock);
46190 +
46191 +static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
46192 +{
46193 +       struct page *page = virt_to_page(pt);
46194 +       unsigned long pfn = page_to_pfn(page);
46195 +
46196 +       BUG_ON(HYPERVISOR_update_va_mapping(
46197 +                      (unsigned long)__va(pfn << PAGE_SHIFT),
46198 +                      pfn_pte(pfn, flags), 0));
46199 +}
46200 +
46201 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
46202 +{
46203 +       pgd_t       *pgd;
46204 +       pud_t       *pud;
46205 +       pmd_t       *pmd;
46206 +       pte_t       *pte;
46207 +       int          g,u,m;
46208 +
46209 +       pgd = mm->pgd;
46210 +       /*
46211 +        * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
46212 +        * be the 'current' task's pagetables (e.g., current may be 32-bit,
46213 +        * but the pagetables may be for a 64-bit task).
46214 +        * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
46215 +        * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
46216 +        */
46217 +       for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
46218 +               if (pgd_none(*pgd))
46219 +                       continue;
46220 +               pud = pud_offset(pgd, 0);
46221 +               if (PTRS_PER_PUD > 1) /* not folded */ 
46222 +                       mm_walk_set_prot(pud,flags);
46223 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
46224 +                       if (pud_none(*pud))
46225 +                               continue;
46226 +                       pmd = pmd_offset(pud, 0);
46227 +                       if (PTRS_PER_PMD > 1) /* not folded */ 
46228 +                               mm_walk_set_prot(pmd,flags);
46229 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
46230 +                               if (pmd_none(*pmd))
46231 +                                       continue;
46232 +                               pte = pte_offset_kernel(pmd,0);
46233 +                               mm_walk_set_prot(pte,flags);
46234 +                       }
46235 +               }
46236 +       }
46237 +}
46238 +
46239 +void mm_pin(struct mm_struct *mm)
46240 +{
46241 +       if (xen_feature(XENFEAT_writable_page_tables))
46242 +               return;
46243 +
46244 +       spin_lock(&mm->page_table_lock);
46245 +
46246 +       mm_walk(mm, PAGE_KERNEL_RO);
46247 +       BUG_ON(HYPERVISOR_update_va_mapping(
46248 +                      (unsigned long)mm->pgd,
46249 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
46250 +                      UVMF_TLB_FLUSH));
46251 +       BUG_ON(HYPERVISOR_update_va_mapping(
46252 +                      (unsigned long)__user_pgd(mm->pgd),
46253 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
46254 +                      UVMF_TLB_FLUSH));
46255 +       xen_pgd_pin(__pa(mm->pgd)); /* kernel */
46256 +       xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
46257 +       mm->context.pinned = 1;
46258 +       spin_lock(&mm_unpinned_lock);
46259 +       list_del(&mm->context.unpinned);
46260 +       spin_unlock(&mm_unpinned_lock);
46261 +
46262 +       spin_unlock(&mm->page_table_lock);
46263 +}
46264 +
46265 +void mm_unpin(struct mm_struct *mm)
46266 +{
46267 +       if (xen_feature(XENFEAT_writable_page_tables))
46268 +               return;
46269 +
46270 +       spin_lock(&mm->page_table_lock);
46271 +
46272 +       xen_pgd_unpin(__pa(mm->pgd));
46273 +       xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
46274 +       BUG_ON(HYPERVISOR_update_va_mapping(
46275 +                      (unsigned long)mm->pgd,
46276 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
46277 +       BUG_ON(HYPERVISOR_update_va_mapping(
46278 +                      (unsigned long)__user_pgd(mm->pgd),
46279 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
46280 +       mm_walk(mm, PAGE_KERNEL);
46281 +       xen_tlb_flush();
46282 +       mm->context.pinned = 0;
46283 +       spin_lock(&mm_unpinned_lock);
46284 +       list_add(&mm->context.unpinned, &mm_unpinned);
46285 +       spin_unlock(&mm_unpinned_lock);
46286 +
46287 +       spin_unlock(&mm->page_table_lock);
46288 +}
46289 +
46290 +void mm_pin_all(void)
46291 +{
46292 +       if (xen_feature(XENFEAT_writable_page_tables))
46293 +               return;
46294 +
46295 +       while (!list_empty(&mm_unpinned))       
46296 +               mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
46297 +                                 context.unpinned));
46298 +}
46299 +
46300 +void _arch_dup_mmap(struct mm_struct *mm)
46301 +{
46302 +    if (!mm->context.pinned)
46303 +        mm_pin(mm);
46304 +}
46305 +
46306 +void _arch_exit_mmap(struct mm_struct *mm)
46307 +{
46308 +    struct task_struct *tsk = current;
46309 +
46310 +    task_lock(tsk);
46311 +
46312 +    /*
46313 +     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
46314 +     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
46315 +     */
46316 +    if ( tsk->active_mm == mm )
46317 +    {
46318 +        tsk->active_mm = &init_mm;
46319 +        atomic_inc(&init_mm.mm_count);
46320 +
46321 +        switch_mm(mm, &init_mm, tsk);
46322 +
46323 +        atomic_dec(&mm->mm_count);
46324 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
46325 +    }
46326 +
46327 +    task_unlock(tsk);
46328 +
46329 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
46330 +         !mm->context.has_foreign_mappings )
46331 +        mm_unpin(mm);
46332 +}
46333 +
46334 +void pte_free(struct page *pte)
46335 +{
46336 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
46337 +
46338 +       if (!pte_write(*virt_to_ptep(va)))
46339 +               BUG_ON(HYPERVISOR_update_va_mapping(
46340 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
46341 +       __free_page(pte);
46342 +}
46343 +#endif /* CONFIG_XEN */
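Editor's note: the CONFIG_XEN block above implements the pin/unpin protocol: mm_walk() turns every page-table page read-only through HYPERVISOR_update_va_mapping() before xen_pgd_pin() hands the pgd to the hypervisor, and the mm_unpinned list lets mm_pin_all() sweep up the rest. A condensed, illustrative view of how the arch hooks defined here drive that lifecycle (the wrapper function is hypothetical):

/* Illustrative only: the pin/unpin lifecycle driven by the hooks above. */
static void example_mm_lifecycle(struct mm_struct *mm)
{
	_arch_dup_mmap(mm);	/* first use: page tables go read-only, pgd pinned */

	/* ... the mm runs with Xen validating every page-table update ... */

	_arch_exit_mmap(mm);	/* unpins once mm_count drops to 1 and no
				 * foreign mappings remain */
}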
46344 +
46345 +static inline pte_t *lookup_address(unsigned long address) 
46346 +{ 
46347 +       pgd_t *pgd = pgd_offset_k(address);
46348 +       pud_t *pud;
46349 +       pmd_t *pmd;
46350 +       pte_t *pte;
46351 +       if (pgd_none(*pgd))
46352 +               return NULL;
46353 +       pud = pud_offset(pgd, address);
46354 +       if (!pud_present(*pud))
46355 +               return NULL; 
46356 +       pmd = pmd_offset(pud, address);
46357 +       if (!pmd_present(*pmd))
46358 +               return NULL; 
46359 +       if (pmd_large(*pmd))
46360 +               return (pte_t *)pmd;
46361 +       pte = pte_offset_kernel(pmd, address);
46362 +       if (pte && !pte_present(*pte))
46363 +               pte = NULL; 
46364 +       return pte;
46365 +} 
46366 +
46367 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
46368 +                                    pgprot_t ref_prot)
46369 +{ 
46370 +       int i; 
46371 +       unsigned long addr;
46372 +       struct page *base = alloc_pages(GFP_KERNEL, 0);
46373 +       pte_t *pbase;
46374 +       if (!base) 
46375 +               return NULL;
46376 +       /*
46377 +        * page_private is used to track the number of entries in
46378 +        * the page table page that have non-standard attributes.
46379 +        */
46380 +       SetPagePrivate(base);
46381 +       page_private(base) = 0;
46382 +
46383 +       address = __pa(address);
46384 +       addr = address & LARGE_PAGE_MASK; 
46385 +       pbase = (pte_t *)page_address(base);
46386 +       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
46387 +               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
46388 +                                  addr == address ? prot : ref_prot);
46389 +       }
46390 +       return base;
46391 +} 
46392 +
46393 +
46394 +static void flush_kernel_map(void *address) 
46395 +{
46396 +       if (0 && address && cpu_has_clflush) {
46397 +               /* is this worth it? */ 
46398 +               int i;
46399 +               for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
46400 +                       asm volatile("clflush (%0)" :: "r" (address + i)); 
46401 +       } else
46402 +               asm volatile("wbinvd":::"memory"); 
46403 +       if (address)
46404 +               __flush_tlb_one(address);
46405 +       else
46406 +               __flush_tlb_all();
46407 +}
46408 +
46409 +
46410 +static inline void flush_map(unsigned long address)
46411 +{      
46412 +       on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
46413 +}
46414 +
46415 +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
46416 +
46417 +static inline void save_page(struct page *fpage)
46418 +{
46419 +       fpage->lru.next = (struct list_head *)deferred_pages;
46420 +       deferred_pages = fpage;
46421 +}
46422 +
46423 +/* 
46424 + * No more special protections in this 2/4MB area - revert to a
46425 + * large page again. 
46426 + */
46427 +static void revert_page(unsigned long address, pgprot_t ref_prot)
46428 +{
46429 +       pgd_t *pgd;
46430 +       pud_t *pud;
46431 +       pmd_t *pmd;
46432 +       pte_t large_pte;
46433 +
46434 +       pgd = pgd_offset_k(address);
46435 +       BUG_ON(pgd_none(*pgd));
46436 +       pud = pud_offset(pgd,address);
46437 +       BUG_ON(pud_none(*pud));
46438 +       pmd = pmd_offset(pud, address);
46439 +       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
46440 +       large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
46441 +       large_pte = pte_mkhuge(large_pte);
46442 +       set_pte((pte_t *)pmd, large_pte);
46443 +}      
46444 +
46445 +static int
46446 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
46447 +                                  pgprot_t ref_prot)
46448 +{ 
46449 +       pte_t *kpte; 
46450 +       struct page *kpte_page;
46451 +       pgprot_t ref_prot2;
46452 +       kpte = lookup_address(address);
46453 +       if (!kpte) return 0;
46454 +       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
46455 +       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
46456 +               if (!pte_huge(*kpte)) {
46457 +                       set_pte(kpte, pfn_pte(pfn, prot));
46458 +               } else {
46459 +                       /*
46460 +                        * split_large_page will take the reference for this
46461 +                        * change_page_attr on the split page.
46462 +                        */
46463 +                       struct page *split;
46464 +                       ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
46465 +                       split = split_large_page(address, prot, ref_prot2);
46466 +                       if (!split)
46467 +                               return -ENOMEM;
46468 +                       set_pte(kpte, mk_pte(split, ref_prot2));
46469 +                       kpte_page = split;
46470 +               }
46471 +               page_private(kpte_page)++;
46472 +       } else if (!pte_huge(*kpte)) {
46473 +               set_pte(kpte, pfn_pte(pfn, ref_prot));
46474 +               BUG_ON(page_private(kpte_page) == 0);
46475 +               page_private(kpte_page)--;
46476 +       } else
46477 +               BUG();
46478 +
46479 +       /* on x86-64 the direct mapping set at boot is not using 4k pages */
46480 +       /*
46481 +        * ..., but the XEN guest kernels (currently) do:
46482 +        * If the pte was reserved, it means it was created at boot
46483 +        * time (not via split_large_page) and in turn we must not
46484 +        * replace it with a large page.
46485 +        */
46486 +#ifndef CONFIG_XEN
46487 +       BUG_ON(PageReserved(kpte_page));
46488 +#else
46489 +       if(!PageReserved(kpte_page))
46490 +#endif
46491 +               if (page_private(kpte_page) == 0) {
46492 +                       save_page(kpte_page);
46493 +                       revert_page(address, ref_prot);
46494 +               }
46495 +       return 0;
46496 +} 
46497 +
46498 +/*
46499 + * Change the page attributes of a page in the linear mapping.
46500 + *
46501 + * This should be used when a page is mapped with a different caching policy
46502 + * than write-back somewhere - some CPUs do not like it when mappings with
46503 + * different caching policies exist. This changes the page attributes of the
46504 + * kernel linear mapping too.
46505 + * 
46506 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
46507 + * This function only deals with the kernel linear map.
46508 + * 
46509 + * Caller must call global_flush_tlb() after this.
46510 + */
46511 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
46512 +{
46513 +       int err = 0; 
46514 +       int i; 
46515 +
46516 +       down_write(&init_mm.mmap_sem);
46517 +       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
46518 +               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
46519 +
46520 +               err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
46521 +               if (err) 
46522 +                       break; 
46523 +               /* Handle kernel mapping too which aliases part of the
46524 +                * lowmem */
46525 +               if (__pa(address) < KERNEL_TEXT_SIZE) {
46526 +                       unsigned long addr2;
46527 +                       pgprot_t prot2;
46528 +                       addr2 = __START_KERNEL_map + __pa(address);
46529 +                       /* Make sure the kernel mappings stay executable */
46530 +                       prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
46531 +                       err = __change_page_attr(addr2, pfn, prot2,
46532 +                                                PAGE_KERNEL_EXEC);
46533 +               } 
46534 +       }       
46535 +       up_write(&init_mm.mmap_sem); 
46536 +       return err;
46537 +}
46538 +
46539 +/* Don't call this for MMIO areas that may not have a mem_map entry */
46540 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
46541 +{
46542 +       unsigned long addr = (unsigned long)page_address(page);
46543 +       return change_page_attr_addr(addr, numpages, prot);
46544 +}
46545 +
46546 +void global_flush_tlb(void)
46547 +{ 
46548 +       struct page *dpage;
46549 +
46550 +       down_read(&init_mm.mmap_sem);
46551 +       dpage = xchg(&deferred_pages, NULL);
46552 +       up_read(&init_mm.mmap_sem);
46553 +
46554 +       flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
46555 +       while (dpage) {
46556 +               struct page *tmp = dpage;
46557 +               dpage = (struct page *)dpage->lru.next;
46558 +               ClearPagePrivate(tmp);
46559 +               __free_page(tmp);
46560 +       } 
46561 +} 
46562 +
46563 +EXPORT_SYMBOL(change_page_attr);
46564 +EXPORT_SYMBOL(global_flush_tlb);
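Editor's note: change_page_attr()/change_page_attr_addr() only touch the kernel linear map and the aliased kernel-text mapping; callers own any other aliases and must finish with global_flush_tlb() so the page-table pages parked by save_page() are freed and all CPUs flush. A minimal sketch of the usual driver pattern built on these exports, assuming the x86-64 PAGE_KERNEL_NOCACHE protection (the wrapper is illustrative):

/* Illustrative only: remap one page uncached, use it, then restore it. */
static int use_page_uncached(struct page *page)
{
	int err = change_page_attr(page, 1, PAGE_KERNEL_NOCACHE);

	if (err)
		return err;
	global_flush_tlb();		/* mandatory after change_page_attr() */

	/* ... access the page while it is mapped uncached ... */

	err = change_page_attr(page, 1, PAGE_KERNEL);
	global_flush_tlb();
	return err;
}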
46565 diff -ruNp linux-2.6.19/arch/x86_64/oprofile/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/oprofile/Makefile
46566 --- linux-2.6.19/arch/x86_64/oprofile/Makefile  2006-11-29 21:57:37.000000000 +0000
46567 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/oprofile/Makefile        2007-02-02 19:10:27.000000000 +0000
46568 @@ -11,9 +11,15 @@ DRIVER_OBJS = $(addprefix ../../../drive
46569         oprofilefs.o oprofile_stats.o \
46570         timer_int.o )
46571  
46572 +ifdef CONFIG_XEN
46573 +XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
46574 +                        xenoprofile.o)
46575 +OPROFILE-y := xenoprof.o
46576 +else
46577  OPROFILE-y := init.o backtrace.o
46578  OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
46579                                      op_model_ppro.o
46580  OPROFILE-$(CONFIG_X86_IO_APIC)    += nmi_timer_int.o 
46581 -
46582 -oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
46583 +endif
46584 +oprofile-y = $(DRIVER_OBJS) $(XENOPROF_COMMON_OBJS) \
46585 +            $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
46586 diff -ruNp linux-2.6.19/arch/x86_64/pci/Makefile linux-2.6.19-xen-3.0.4/arch/x86_64/pci/Makefile
46587 --- linux-2.6.19/arch/x86_64/pci/Makefile       2006-11-29 21:57:37.000000000 +0000
46588 +++ linux-2.6.19-xen-3.0.4/arch/x86_64/pci/Makefile     2007-02-02 19:10:27.000000000 +0000
46589 @@ -15,8 +15,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
46590  
46591  obj-$(CONFIG_NUMA)     += k8-bus.o
46592  
46593 +# pcifront should be after mmconfig.o and direct.o as it should only
46594 +# take over if direct access to the PCI bus is unavailable
46595 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
46596 +
46597  direct-y += ../../i386/pci/direct.o
46598  acpi-y   += ../../i386/pci/acpi.o
46599 +pcifront-y += ../../i386/pci/pcifront.o
46600  legacy-y += ../../i386/pci/legacy.o
46601  irq-y    += ../../i386/pci/irq.o
46602  common-y += ../../i386/pci/common.o
46603 @@ -24,3 +29,10 @@ fixup-y  += ../../i386/pci/fixup.o
46604  i386-y  += ../../i386/pci/i386.o
46605  init-y += ../../i386/pci/init.o
46606  early-y += ../../i386/pci/early.o
46607 +
46608 +ifdef CONFIG_XEN
46609 +irq-y          := ../../i386/pci/irq-xen.o
46610 +include $(srctree)/scripts/Makefile.xen
46611 +
46612 +obj-y := $(call cherrypickxen, $(obj-y))
46613 +endif
46614 diff -ruNp linux-2.6.19/drivers/Makefile linux-2.6.19-xen-3.0.4/drivers/Makefile
46615 --- linux-2.6.19/drivers/Makefile       2006-11-29 21:57:37.000000000 +0000
46616 +++ linux-2.6.19-xen-3.0.4/drivers/Makefile     2007-02-02 19:10:27.000000000 +0000
46617 @@ -31,6 +31,7 @@ obj-y                         += base/ block/ misc/ mfd/ net/
46618  obj-$(CONFIG_NUBUS)            += nubus/
46619  obj-$(CONFIG_ATM)              += atm/
46620  obj-$(CONFIG_PPC_PMAC)         += macintosh/
46621 +obj-$(CONFIG_XEN)              += xen/
46622  obj-$(CONFIG_IDE)              += ide/
46623  obj-$(CONFIG_FC4)              += fc4/
46624  obj-$(CONFIG_SCSI)             += scsi/
46625 diff -ruNp linux-2.6.19/drivers/acpi/Kconfig linux-2.6.19-xen-3.0.4/drivers/acpi/Kconfig
46626 --- linux-2.6.19/drivers/acpi/Kconfig   2006-11-29 21:57:37.000000000 +0000
46627 +++ linux-2.6.19-xen-3.0.4/drivers/acpi/Kconfig 2007-02-02 19:10:27.000000000 +0000
46628 @@ -45,7 +45,7 @@ if ACPI
46629  
46630  config ACPI_SLEEP
46631         bool "Sleep States"
46632 -       depends on X86 && (!SMP || SUSPEND_SMP)
46633 +       depends on X86 && (!SMP || SUSPEND_SMP) && !XEN
46634         depends on PM
46635         default y
46636         ---help---
46637 @@ -305,6 +305,7 @@ config ACPI_SYSTEM
46638  config X86_PM_TIMER
46639         bool "Power Management Timer Support" if EMBEDDED
46640         depends on X86
46641 +       depends on !XEN
46642         default y
46643         help
46644           The Power Management Timer is available on all ACPI-capable,
46645 diff -ruNp linux-2.6.19/drivers/char/hangcheck-timer.c linux-2.6.19-xen-3.0.4/drivers/char/hangcheck-timer.c
46646 --- linux-2.6.19/drivers/char/hangcheck-timer.c 2006-11-29 21:57:37.000000000 +0000
46647 +++ linux-2.6.19-xen-3.0.4/drivers/char/hangcheck-timer.c       2007-02-02 19:10:30.000000000 +0000
46648 @@ -117,7 +117,7 @@ __setup("hcheck_reboot", hangcheck_parse
46649  __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
46650  #endif /* not MODULE */
46651  
46652 -#if defined(CONFIG_X86_64) || defined(CONFIG_S390)
46653 +#if defined(CONFIG_X86_64) || defined(CONFIG_S390) || defined(CONFIG_X86_XEN)
46654  # define HAVE_MONOTONIC
46655  # define TIMER_FREQ 1000000000ULL
46656  #elif defined(CONFIG_IA64)
46657 diff -ruNp linux-2.6.19/drivers/char/mem.c linux-2.6.19-xen-3.0.4/drivers/char/mem.c
46658 --- linux-2.6.19/drivers/char/mem.c     2007-02-02 20:26:43.000000000 +0000
46659 +++ linux-2.6.19-xen-3.0.4/drivers/char/mem.c   2007-02-02 19:10:30.000000000 +0000
46660 @@ -102,6 +102,7 @@ static inline int valid_mmap_phys_addr_r
46661  }
46662  #endif
46663  
46664 +#ifndef ARCH_HAS_DEV_MEM
46665  /*
46666   * This function reads the *physical* memory. The f_pos points directly to the 
46667   * memory location. 
46668 @@ -224,6 +225,7 @@ static ssize_t write_mem(struct file * f
46669         *ppos += written;
46670         return written;
46671  }
46672 +#endif
46673  
46674  #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
46675  static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
46676 @@ -810,6 +812,7 @@ static int open_port(struct inode * inod
46677  #define open_kmem      open_mem
46678  #define open_oldmem    open_mem
46679  
46680 +#ifndef ARCH_HAS_DEV_MEM
46681  static const struct file_operations mem_fops = {
46682         .llseek         = memory_lseek,
46683         .read           = read_mem,
46684 @@ -818,6 +821,9 @@ static const struct file_operations mem_
46685         .open           = open_mem,
46686         .get_unmapped_area = get_unmapped_area_mem,
46687  };
46688 +#else
46689 +extern struct file_operations mem_fops;
46690 +#endif
46691  
46692  static const struct file_operations kmem_fops = {
46693         .llseek         = memory_lseek,
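Editor's note: the mem.c hunks above compile out the generic read_mem()/write_mem() pair and the static mem_fops whenever ARCH_HAS_DEV_MEM is defined, and instead rely on an externally provided mem_fops. The replacement lives in the Xen architecture code elsewhere in this patch; the sketch below only shows the shape such an override has to take (all helper names are illustrative):

/* Illustrative only: an arch-supplied /dev/mem backend picked up through the
 * "extern struct file_operations mem_fops" declaration above. */
static ssize_t arch_read_mem(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	/* interpret *ppos as a machine address; map and copy per page */
	return 0;
}

static ssize_t arch_write_mem(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	/* mirror image of arch_read_mem() */
	return count;
}

/* Deliberately not static: drivers/char/mem.c now declares it extern. */
struct file_operations mem_fops = {
	.read	= arch_read_mem,
	.write	= arch_write_mem,
};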
46694 diff -ruNp linux-2.6.19/drivers/char/tpm/Kconfig linux-2.6.19-xen-3.0.4/drivers/char/tpm/Kconfig
46695 --- linux-2.6.19/drivers/char/tpm/Kconfig       2006-11-29 21:57:37.000000000 +0000
46696 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/Kconfig     2007-02-02 19:10:30.000000000 +0000
46697 @@ -31,7 +31,7 @@ config TCG_TIS
46698  
46699  config TCG_NSC
46700         tristate "National Semiconductor TPM Interface"
46701 -       depends on TCG_TPM && PNPACPI
46702 +       depends on TCG_TPM && PNPACPI && !XEN_UNPRIVILEGED_GUEST
46703         ---help---
46704           If you have a TPM security chip from National Semiconductor 
46705           say Yes and it will be accessible from within Linux.  To 
46706 @@ -58,5 +58,13 @@ config TCG_INFINEON
46707           Further information on this driver and the supported hardware
46708           can be found at http://www.prosec.rub.de/tpm
46709  
46710 -endmenu
46711 +config TCG_XEN
46712 +       tristate "XEN TPM Interface"
46713 +       depends on TCG_TPM && XEN
46714 +       ---help---
46715 +         If you want to make TPM support available to a Xen user domain,
46716 +         say Yes and it will be accessible from within Linux.
46717 +         To compile this driver as a module, choose M here; the module
46718 +         will be called tpm_xenu.
46719  
46720 +endmenu
46721 diff -ruNp linux-2.6.19/drivers/char/tpm/Makefile linux-2.6.19-xen-3.0.4/drivers/char/tpm/Makefile
46722 --- linux-2.6.19/drivers/char/tpm/Makefile      2006-11-29 21:57:37.000000000 +0000
46723 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/Makefile    2007-02-02 19:10:30.000000000 +0000
46724 @@ -9,3 +9,5 @@ obj-$(CONFIG_TCG_TIS) += tpm_tis.o
46725  obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
46726  obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
46727  obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
46728 +obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
46729 +tpm_xenu-y = tpm_xen.o tpm_vtpm.o
46730 diff -ruNp linux-2.6.19/drivers/char/tpm/tpm.c linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm.c
46731 --- linux-2.6.19/drivers/char/tpm/tpm.c 2006-11-29 21:57:37.000000000 +0000
46732 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm.c       2007-02-02 19:10:30.000000000 +0000
46733 @@ -30,7 +30,9 @@
46734  
46735  enum tpm_const {
46736         TPM_MINOR = 224,        /* officially assigned */
46737 +#ifndef CONFIG_XEN
46738         TPM_BUFSIZE = 2048,
46739 +#endif
46740         TPM_NUM_DEVICES = 256,
46741  };
46742  
46743 @@ -331,7 +333,11 @@ static void timeout_work(void *ptr)
46744  
46745         down(&chip->buffer_mutex);
46746         atomic_set(&chip->data_pending, 0);
46747 +#ifndef CONFIG_XEN
46748         memset(chip->data_buffer, 0, TPM_BUFSIZE);
46749 +#else
46750 +       memset(chip->data_buffer, 0, get_chip_buffersize(chip));
46751 +#endif
46752         up(&chip->buffer_mutex);
46753  }
46754  
46755 @@ -921,7 +927,12 @@ int tpm_open(struct inode *inode, struct
46756  
46757         spin_unlock(&driver_lock);
46758  
46759 +#ifndef CONFIG_XEN
46760         chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL);
46761 +#else
46762 +       chip->data_buffer = kmalloc(get_chip_buffersize(chip) * sizeof(u8),
46763 +                                   GFP_KERNEL);
46764 +#endif
46765         if (chip->data_buffer == NULL) {
46766                 chip->num_opens--;
46767                 put_device(chip->dev);
46768 @@ -969,8 +980,13 @@ ssize_t tpm_write(struct file *file, con
46769  
46770         down(&chip->buffer_mutex);
46771  
46772 +#ifndef CONFIG_XEN
46773         if (in_size > TPM_BUFSIZE)
46774                 in_size = TPM_BUFSIZE;
46775 +#else
46776 +       if (in_size > get_chip_buffersize(chip))
46777 +               in_size = get_chip_buffersize(chip);
46778 +#endif
46779  
46780         if (copy_from_user
46781             (chip->data_buffer, (void __user *) buf, in_size)) {
46782 @@ -979,9 +995,17 @@ ssize_t tpm_write(struct file *file, con
46783         }
46784  
46785         /* atomic tpm command send and result receive */
46786 +#ifndef CONFIG_XEN
46787         out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE);
46788 +#else
46789 +       out_size = tpm_transmit(chip, chip->data_buffer,
46790 +                               get_chip_buffersize(chip));
46791 +#endif
46792  
46793         atomic_set(&chip->data_pending, out_size);
46794 +#ifdef CONFIG_XEN
46795 +       atomic_set(&chip->data_position, 0);
46796 +#endif
46797         up(&chip->buffer_mutex);
46798  
46799         /* Set a timeout by which the reader must come claim the result */
46800 @@ -996,21 +1020,52 @@ ssize_t tpm_read(struct file *file, char
46801  {
46802         struct tpm_chip *chip = file->private_data;
46803         int ret_size;
46804 +#ifdef CONFIG_XEN
46805 +       int pos, pending = 0;
46806 +#endif
46807  
46808 +#ifndef CONFIG_XEN
46809         del_singleshot_timer_sync(&chip->user_read_timer);
46810         flush_scheduled_work();
46811 +#endif
46812         ret_size = atomic_read(&chip->data_pending);
46813 +#ifndef CONFIG_XEN
46814         atomic_set(&chip->data_pending, 0);
46815 +#endif
46816         if (ret_size > 0) {     /* relay data */
46817                 if (size < ret_size)
46818                         ret_size = size;
46819  
46820 +#ifdef CONFIG_XEN
46821 +               pos = atomic_read(&chip->data_position);
46822 +#endif
46823                 down(&chip->buffer_mutex);
46824 +#ifndef CONFIG_XEN
46825                 if (copy_to_user(buf, chip->data_buffer, ret_size))
46826 +#else
46827 +               if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) {
46828 +#endif
46829                         ret_size = -EFAULT;
46830 +#ifdef CONFIG_XEN
46831 +               } else {
46832 +                       pending = atomic_read(&chip->data_pending) - ret_size;
46833 +                       if ( pending ) {
46834 +                               atomic_set(&chip->data_pending, pending);
46835 +                               atomic_set(&chip->data_position,
46836 +                                          pos+ret_size);
46837 +                       }
46838 +               }
46839 +#endif
46840                 up(&chip->buffer_mutex);
46841         }
46842  
46843 +#ifdef CONFIG_XEN
46844 +       if ( ret_size <= 0 || pending == 0 ) {
46845 +               atomic_set(&chip->data_pending, 0);
46846 +               del_singleshot_timer_sync(&chip->user_read_timer);
46847 +               flush_scheduled_work();
46848 +       }
46849 +#endif
46850         return ret_size;
46851  }
46852  EXPORT_SYMBOL_GPL(tpm_read);
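Editor's note: under CONFIG_XEN the tpm_read() changes above stop discarding the whole response on the first read. data_position records how far the buffer has been consumed, data_pending is reduced by the amount returned, and the timer/work cleanup only happens once nothing (or an error) is left. That lets a response larger than the caller's buffer be drained over several reads; a small userspace sketch of such a loop (device fd assumed open, and a command is presumed to have been written first):

#include <fcntl.h>
#include <unistd.h>

/* Illustrative only: drain one TPM response in 64-byte chunks; each read()
 * resumes at the offset the driver kept in data_position. */
static ssize_t drain_tpm_response(int fd, unsigned char *buf, size_t buflen)
{
	ssize_t n = 0, total = 0;

	while ((size_t)total < buflen &&
	       (n = read(fd, buf + total, 64)) > 0)
		total += n;
	return n < 0 ? n : total;
}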
46853 diff -ruNp linux-2.6.19/drivers/char/tpm/tpm.h linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm.h
46854 --- linux-2.6.19/drivers/char/tpm/tpm.h 2006-11-29 21:57:37.000000000 +0000
46855 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm.h       2007-02-02 19:10:30.000000000 +0000
46856 @@ -61,6 +61,7 @@ struct tpm_vendor_specific {
46857         const u8 req_complete_mask;
46858         const u8 req_complete_val;
46859         const u8 req_canceled;
46860 +       u32 buffersize;
46861         void __iomem *iobase;           /* ioremapped address */
46862         unsigned long base;             /* TPM base address */
46863  
46864 @@ -94,6 +95,7 @@ struct tpm_chip {
46865         /* Data passed to and from the tpm via the read/write calls */
46866         u8 *data_buffer;
46867         atomic_t data_pending;
46868 +       atomic_t data_position;
46869         struct semaphore buffer_mutex;
46870  
46871         struct timer_list user_read_timer;      /* user needs to claim result */
46872 @@ -121,6 +123,11 @@ static inline void tpm_write_index(int b
46873         outb(value & 0xFF, base+1);
46874  }
46875  
46876 +static inline u32 get_chip_buffersize(struct tpm_chip *chip)
46877 +{
46878 +       return chip->vendor.buffersize;
46879 +}
46880 +
46881  extern void tpm_get_timeouts(struct tpm_chip *);
46882  extern void tpm_gen_interrupt(struct tpm_chip *);
46883  extern void tpm_continue_selftest(struct tpm_chip *);
46884 diff -ruNp linux-2.6.19/drivers/char/tpm/tpm_vtpm.c linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm_vtpm.c
46885 --- linux-2.6.19/drivers/char/tpm/tpm_vtpm.c    1970-01-01 00:00:00.000000000 +0000
46886 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm_vtpm.c  2007-02-02 19:10:30.000000000 +0000
46887 @@ -0,0 +1,547 @@
46888 +/*
46889 + * Copyright (C) 2006 IBM Corporation
46890 + *
46891 + * Authors:
46892 + * Stefan Berger <stefanb@us.ibm.com>
46893 + *
46894 + * Generic device driver part for device drivers in a virtualized
46895 + * environment.
46896 + *
46897 + * This program is free software; you can redistribute it and/or
46898 + * modify it under the terms of the GNU General Public License as
46899 + * published by the Free Software Foundation, version 2 of the
46900 + * License.
46901 + *
46902 + */
46903 +
46904 +#include <asm/uaccess.h>
46905 +#include <linux/list.h>
46906 +#include <linux/device.h>
46907 +#include <linux/interrupt.h>
46908 +#include <linux/platform_device.h>
46909 +#include "tpm.h"
46910 +#include "tpm_vtpm.h"
46911 +
46912 +/* read status bits */
46913 +enum {
46914 +       STATUS_BUSY = 0x01,
46915 +       STATUS_DATA_AVAIL = 0x02,
46916 +       STATUS_READY = 0x04
46917 +};
46918 +
46919 +struct transmission {
46920 +       struct list_head next;
46921 +
46922 +       unsigned char *request;
46923 +       size_t  request_len;
46924 +       size_t  request_buflen;
46925 +
46926 +       unsigned char *response;
46927 +       size_t  response_len;
46928 +       size_t  response_buflen;
46929 +
46930 +       unsigned int flags;
46931 +};
46932 +
46933 +enum {
46934 +       TRANSMISSION_FLAG_WAS_QUEUED = 0x1
46935 +};
46936 +
46937 +
46938 +enum {
46939 +       DATAEX_FLAG_QUEUED_ONLY = 0x1
46940 +};
46941 +
46942 +
46943 +/* local variables */
46944 +
46945 +/* local function prototypes */
46946 +static int _vtpm_send_queued(struct tpm_chip *chip);
46947 +
46948 +
46949 +/* =============================================================
46950 + * Some utility functions
46951 + * =============================================================
46952 + */
46953 +static void vtpm_state_init(struct vtpm_state *vtpms)
46954 +{
46955 +       vtpms->current_request = NULL;
46956 +       spin_lock_init(&vtpms->req_list_lock);
46957 +       init_waitqueue_head(&vtpms->req_wait_queue);
46958 +       INIT_LIST_HEAD(&vtpms->queued_requests);
46959 +
46960 +       vtpms->current_response = NULL;
46961 +       spin_lock_init(&vtpms->resp_list_lock);
46962 +       init_waitqueue_head(&vtpms->resp_wait_queue);
46963 +
46964 +       vtpms->disconnect_time = jiffies;
46965 +}
46966 +
46967 +
46968 +static inline struct transmission *transmission_alloc(void)
46969 +{
46970 +       return kzalloc(sizeof(struct transmission), GFP_ATOMIC);
46971 +}
46972 +
46973 +static unsigned char *
46974 +transmission_set_req_buffer(struct transmission *t,
46975 +                            unsigned char *buffer, size_t len)
46976 +{
46977 +       if (t->request_buflen < len) {
46978 +               kfree(t->request);
46979 +               t->request = kmalloc(len, GFP_KERNEL);
46980 +               if (!t->request) {
46981 +                       t->request_buflen = 0;
46982 +                       return NULL;
46983 +               }
46984 +               t->request_buflen = len;
46985 +       }
46986 +
46987 +       memcpy(t->request, buffer, len);
46988 +       t->request_len = len;
46989 +
46990 +       return t->request;
46991 +}
46992 +
46993 +static unsigned char *
46994 +transmission_set_res_buffer(struct transmission *t,
46995 +                            const unsigned char *buffer, size_t len)
46996 +{
46997 +       if (t->response_buflen < len) {
46998 +               kfree(t->response);
46999 +               t->response = kmalloc(len, GFP_ATOMIC);
47000 +               if (!t->response) {
47001 +                       t->response_buflen = 0;
47002 +                       return NULL;
47003 +               }
47004 +               t->response_buflen = len;
47005 +       }
47006 +
47007 +       memcpy(t->response, buffer, len);
47008 +       t->response_len = len;
47009 +
47010 +       return t->response;
47011 +}
47012 +
47013 +static inline void transmission_free(struct transmission *t)
47014 +{
47015 +       kfree(t->request);
47016 +       kfree(t->response);
47017 +       kfree(t);
47018 +}
47019 +
47020 +/* =============================================================
47021 + * Interface with the lower layer driver
47022 + * =============================================================
47023 + */
47024 +/*
47025 + * Lower layer uses this function to make a response available.
47026 + */
47027 +int vtpm_vd_recv(const struct tpm_chip *chip,
47028 +                 const unsigned char *buffer, size_t count,
47029 +                 void *ptr)
47030 +{
47031 +       unsigned long flags;
47032 +       int ret_size = 0;
47033 +       struct transmission *t;
47034 +       struct vtpm_state *vtpms;
47035 +
47036 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
47037 +
47038 +       /*
47039 +        * The list with requests must contain one request
47040 +        * only and the element there must be the one that
47041 +        * was passed to me from the front-end.
47042 +        */
47043 +       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
47044 +       if (vtpms->current_request != ptr) {
47045 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47046 +               return 0;
47047 +       }
47048 +
47049 +       if ((t = vtpms->current_request)) {
47050 +               transmission_free(t);
47051 +               vtpms->current_request = NULL;
47052 +       }
47053 +
47054 +       t = transmission_alloc();
47055 +       if (t) {
47056 +               if (!transmission_set_res_buffer(t, buffer, count)) {
47057 +                       transmission_free(t);
47058 +                       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47059 +                       return -ENOMEM;
47060 +               }
47061 +               ret_size = count;
47062 +               vtpms->current_response = t;
47063 +               wake_up_interruptible(&vtpms->resp_wait_queue);
47064 +       }
47065 +       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47066 +
47067 +       return ret_size;
47068 +}
47069 +
47070 +
47071 +/*
47072 + * Lower layer indicates its status (connected/disconnected)
47073 + */
47074 +void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
47075 +{
47076 +       struct vtpm_state *vtpms;
47077 +
47078 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
47079 +
47080 +       vtpms->vd_status = vd_status;
47081 +       if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
47082 +               vtpms->disconnect_time = jiffies;
47083 +       }
47084 +}
47085 +
47086 +/* =============================================================
47087 + * Interface with the generic TPM driver
47088 + * =============================================================
47089 + */
47090 +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
47091 +{
47092 +       int rc = 0;
47093 +       unsigned long flags;
47094 +       struct vtpm_state *vtpms;
47095 +
47096 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
47097 +
47098 +       /*
47099 +        * Check if the previous operation only queued the command.
47100 +        * In that case there won't be a response, so just return
47101 +        * from here and reset the flag. In any other case a
47102 +        * response from the back-end is expected.
47103 +        */
47104 +       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
47105 +       if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
47106 +               vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
47107 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47108 +               /*
47109 +                * The first few commands (measurements) must be
47110 +                * queued since it might not be possible to talk to the
47111 +                * TPM, yet.
47112 +                * Return a response of up to 30 '0's.
47113 +                */
47114 +
47115 +               count = min_t(size_t, count, 30);
47116 +               memset(buf, 0x0, count);
47117 +               return count;
47118 +       }
47119 +       /*
47120 +        * Check whether something is in the responselist and if
47121 +        * there's nothing in the list wait for something to appear.
47122 +        */
47123 +
47124 +       if (!vtpms->current_response) {
47125 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47126 +               interruptible_sleep_on_timeout(&vtpms->resp_wait_queue,
47127 +                                              1000);
47128 +               spin_lock_irqsave(&vtpms->resp_list_lock ,flags);
47129 +       }
47130 +
47131 +       if (vtpms->current_response) {
47132 +               struct transmission *t = vtpms->current_response;
47133 +               vtpms->current_response = NULL;
47134 +               rc = min(count, t->response_len);
47135 +               memcpy(buf, t->response, rc);
47136 +               transmission_free(t);
47137 +       }
47138 +
47139 +       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47140 +       return rc;
47141 +}
47142 +
47143 +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
47144 +{
47145 +       int rc = 0;
47146 +       unsigned long flags;
47147 +       struct transmission *t = transmission_alloc();
47148 +       struct vtpm_state *vtpms;
47149 +
47150 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
47151 +
47152 +       if (!t)
47153 +               return -ENOMEM;
47154 +       /*
47155 +        * If there's a current request, it must be the
47156 +        * previous request that has timed out.
47157 +        */
47158 +       spin_lock_irqsave(&vtpms->req_list_lock, flags);
47159 +       if (vtpms->current_request != NULL) {
47160 +               printk("WARNING: Sending although there is a request outstanding.\n"
47161 +                      "         Previous request must have timed out.\n");
47162 +               transmission_free(vtpms->current_request);
47163 +               vtpms->current_request = NULL;
47164 +       }
47165 +       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
47166 +
47167 +       /*
47168 +        * Queue the packet if the driver below is not
47169 +        * ready, yet, or there is any packet already
47170 +        * in the queue.
47171 +        * If the driver below is ready, unqueue all
47172 +        * packets first before sending our current
47173 +        * packet.
47174 +        * For each unqueued packet, except for the
47175 +        * last (=current) packet, call the function
47176 +        * tpm_xen_recv to wait for the response to come
47177 +        * back.
47178 +        */
47179 +       if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
47180 +               if (time_after(jiffies,
47181 +                              vtpms->disconnect_time + HZ * 10)) {
47182 +                       rc = -ENOENT;
47183 +               } else {
47184 +                       goto queue_it;
47185 +               }
47186 +       } else {
47187 +               /*
47188 +                * Send all queued packets.
47189 +                */
47190 +               if (_vtpm_send_queued(chip) == 0) {
47191 +
47192 +                       vtpms->current_request = t;
47193 +
47194 +                       rc = vtpm_vd_send(vtpms->tpm_private,
47195 +                                         buf,
47196 +                                         count,
47197 +                                         t);
47198 +                       /*
47199 +                        * The generic TPM driver will call
47200 +                        * the function to receive the response.
47201 +                        */
47202 +                       if (rc < 0) {
47203 +                               vtpms->current_request = NULL;
47204 +                               goto queue_it;
47205 +                       }
47206 +               } else {
47207 +queue_it:
47208 +                       if (!transmission_set_req_buffer(t, buf, count)) {
47209 +                               transmission_free(t);
47210 +                               rc = -ENOMEM;
47211 +                               goto exit;
47212 +                       }
47213 +                       /*
47214 +                        * An error occurred. Don't even try
47215 +                        * to send the current request. Just
47216 +                        * queue it.
47217 +                        */
47218 +                       spin_lock_irqsave(&vtpms->req_list_lock, flags);
47219 +                       vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
47220 +                       list_add_tail(&t->next, &vtpms->queued_requests);
47221 +                       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
47222 +               }
47223 +       }
47224 +
47225 +exit:
47226 +       return rc;
47227 +}
47228 +
47229 +
47230 +/*
47231 + * Send all queued requests.
47232 + */
47233 +static int _vtpm_send_queued(struct tpm_chip *chip)
47234 +{
47235 +       int rc;
47236 +       int error = 0;
47237 +       long flags;
47238 +       unsigned char buffer[1];
47239 +       struct vtpm_state *vtpms;
47240 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
47241 +
47242 +       spin_lock_irqsave(&vtpms->req_list_lock, flags);
47243 +
47244 +       while (!list_empty(&vtpms->queued_requests)) {
47245 +               /*
47246 +                * Need to dequeue them.
47247 +                * Read the result into a dummy buffer.
47248 +                */
47249 +               struct transmission *qt = (struct transmission *)
47250 +                                         vtpms->queued_requests.next;
47251 +               list_del(&qt->next);
47252 +               vtpms->current_request = qt;
47253 +               spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
47254 +
47255 +               rc = vtpm_vd_send(vtpms->tpm_private,
47256 +                                 qt->request,
47257 +                                 qt->request_len,
47258 +                                 qt);
47259 +
47260 +               if (rc < 0) {
47261 +                       spin_lock_irqsave(&vtpms->req_list_lock, flags);
47262 +                       if ((qt = vtpms->current_request) != NULL) {
47263 +                               /*
47264 +                                * requeue it at the beginning
47265 +                                * of the list
47266 +                                */
47267 +                               list_add(&qt->next,
47268 +                                        &vtpms->queued_requests);
47269 +                       }
47270 +                       vtpms->current_request = NULL;
47271 +                       error = 1;
47272 +                       break;
47273 +               }
47274 +               /*
47275 +                * After this point qt is not valid anymore!
47276 +                * It is freed when the front-end is delivering
47277 +                * the data by calling tpm_recv
47278 +                */
47279 +               /*
47280 +                * Receive response into provided dummy buffer
47281 +                */
47282 +               rc = vtpm_recv(chip, buffer, sizeof(buffer));
47283 +               spin_lock_irqsave(&vtpms->req_list_lock, flags);
47284 +       }
47285 +
47286 +       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
47287 +
47288 +       return error;
47289 +}
47290 +
47291 +static void vtpm_cancel(struct tpm_chip *chip)
47292 +{
47293 +       unsigned long flags;
47294 +       struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
47295 +
47296 +       spin_lock_irqsave(&vtpms->resp_list_lock,flags);
47297 +
47298 +       if (!vtpms->current_response && vtpms->current_request) {
47299 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47300 +               interruptible_sleep_on(&vtpms->resp_wait_queue);
47301 +               spin_lock_irqsave(&vtpms->resp_list_lock,flags);
47302 +       }
47303 +
47304 +       if (vtpms->current_response) {
47305 +               struct transmission *t = vtpms->current_response;
47306 +               vtpms->current_response = NULL;
47307 +               transmission_free(t);
47308 +       }
47309 +
47310 +       spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
47311 +}
47312 +
47313 +static u8 vtpm_status(struct tpm_chip *chip)
47314 +{
47315 +       u8 rc = 0;
47316 +       unsigned long flags;
47317 +       struct vtpm_state *vtpms;
47318 +
47319 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
47320 +
47321 +       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
47322 +       /*
47323 +        * Data are available if:
47324 +        *  - there's a current response
47325 +        *  - the last packet was queued only (this is fake, but necessary to
47326 +        *      get the generic TPM layer to call the receive function.)
47327 +        */
47328 +       if (vtpms->current_response ||
47329 +           0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) {
47330 +               rc = STATUS_DATA_AVAIL;
47331 +       } else if (!vtpms->current_response && !vtpms->current_request) {
47332 +               rc = STATUS_READY;
47333 +       }
47334 +
47335 +       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
47336 +       return rc;
47337 +}
47338 +
47339 +static struct file_operations vtpm_ops = {
47340 +       .owner = THIS_MODULE,
47341 +       .llseek = no_llseek,
47342 +       .open = tpm_open,
47343 +       .read = tpm_read,
47344 +       .write = tpm_write,
47345 +       .release = tpm_release,
47346 +};
47347 +
47348 +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
47349 +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
47350 +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
47351 +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
47352 +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
47353 +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
47354 +                  NULL);
47355 +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
47356 +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
47357 +
47358 +static struct attribute *vtpm_attrs[] = {
47359 +       &dev_attr_pubek.attr,
47360 +       &dev_attr_pcrs.attr,
47361 +       &dev_attr_enabled.attr,
47362 +       &dev_attr_active.attr,
47363 +       &dev_attr_owned.attr,
47364 +       &dev_attr_temp_deactivated.attr,
47365 +       &dev_attr_caps.attr,
47366 +       &dev_attr_cancel.attr,
47367 +       NULL,
47368 +};
47369 +
47370 +static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
47371 +
47372 +#define TPM_LONG_TIMEOUT   (10 * 60 * HZ)
47373 +
47374 +static struct tpm_vendor_specific tpm_vtpm = {
47375 +       .recv = vtpm_recv,
47376 +       .send = vtpm_send,
47377 +       .cancel = vtpm_cancel,
47378 +       .status = vtpm_status,
47379 +       .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
47380 +       .req_complete_val  = STATUS_DATA_AVAIL,
47381 +       .req_canceled = STATUS_READY,
47382 +       .attr_group = &vtpm_attr_grp,
47383 +       .miscdev = {
47384 +               .fops = &vtpm_ops,
47385 +       },
47386 +       .duration = {
47387 +               TPM_LONG_TIMEOUT,
47388 +               TPM_LONG_TIMEOUT,
47389 +               TPM_LONG_TIMEOUT,
47390 +       },
47391 +};
47392 +
47393 +struct tpm_chip *init_vtpm(struct device *dev,
47394 +                           struct tpm_virtual_device *tvd,
47395 +                           struct tpm_private *tp)
47396 +{
47397 +       long rc;
47398 +       struct tpm_chip *chip;
47399 +       struct vtpm_state *vtpms;
47400 +
47401 +       vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL);
47402 +       if (!vtpms)
47403 +               return ERR_PTR(-ENOMEM);
47404 +
47405 +       vtpm_state_init(vtpms);
47406 +       vtpms->tpmvd = tvd;
47407 +       vtpms->tpm_private = tp;
47408 +
47409 +       if (tvd)
47410 +               tpm_vtpm.buffersize = tvd->max_tx_size;
47411 +
47412 +       chip = tpm_register_hardware(dev, &tpm_vtpm);
47413 +       if (!chip) {
47414 +               rc = -ENODEV;
47415 +               goto err_free_mem;
47416 +       }
47417 +
47418 +       chip_set_private(chip, vtpms);
47419 +
47420 +       return chip;
47421 +
47422 +err_free_mem:
47423 +       kfree(vtpms);
47424 +
47425 +       return ERR_PTR(rc);
47426 +}
47427 +
47428 +void cleanup_vtpm(struct device *dev)
47429 +{
47430 +       struct tpm_chip *chip = dev_get_drvdata(dev);
47431 +       struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip);
47432 +       tpm_remove_hardware(dev);
47433 +       kfree(vtpms);
47434 +}
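Editor's note: tpm_vtpm.c above is the glue between the generic TPM core and a virtual transport. vtpm_send() either hands the request to the lower layer via vtpm_vd_send() or queues it while disconnected; the response returns when the lower layer calls vtpm_vd_recv() with the same opaque pointer, which wakes anyone sleeping in vtpm_recv(); link changes are reported through vtpm_vd_status(). A hedged sketch of a transport completion path honouring that contract (the two example_* functions are hypothetical):

/* Illustrative only: how a transport (such as tpm_xen.c) is expected to
 * complete requests it accepted in vtpm_vd_send(). */
static void example_transport_rx(struct tpm_chip *chip,
				 const unsigned char *resp, size_t len,
				 void *remember /* ptr given to vtpm_vd_send */)
{
	/* Matches the response to the outstanding request, wakes the reader. */
	vtpm_vd_recv(chip, resp, len, remember);
}

static void example_transport_link_change(struct tpm_chip *chip, int up)
{
	/* Queued requests are flushed on the next send once connected. */
	vtpm_vd_status(chip, up ? TPM_VD_STATUS_CONNECTED
				: TPM_VD_STATUS_DISCONNECTED);
}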
47435 diff -ruNp linux-2.6.19/drivers/char/tpm/tpm_vtpm.h linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm_vtpm.h
47436 --- linux-2.6.19/drivers/char/tpm/tpm_vtpm.h    1970-01-01 00:00:00.000000000 +0000
47437 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm_vtpm.h  2007-02-02 19:10:30.000000000 +0000
47438 @@ -0,0 +1,68 @@
47439 +#ifndef TPM_VTPM_H
47440 +#define TPM_VTPM_H
47441 +
47442 +struct tpm_chip;
47443 +struct tpm_private;
47444 +
47445 +struct tpm_virtual_device {
47446 +       /*
47447 +        * This field indicates the maximum size the driver can
47448 +        * transfer in one chunk. It is filled in by the front-end
47449 +        * driver and should be propagated to the generic tpm driver
47450 +        * for allocation of buffers.
47451 +        */
47452 +       unsigned int max_tx_size;
47453 +};
47454 +
47455 +struct vtpm_state {
47456 +       struct transmission *current_request;
47457 +       spinlock_t           req_list_lock;
47458 +       wait_queue_head_t    req_wait_queue;
47459 +
47460 +       struct list_head     queued_requests;
47461 +
47462 +       struct transmission *current_response;
47463 +       spinlock_t           resp_list_lock;
47464 +       wait_queue_head_t    resp_wait_queue;     // processes waiting for responses
47465 +
47466 +       u8                   vd_status;
47467 +       u8                   flags;
47468 +
47469 +       unsigned long        disconnect_time;
47470 +
47471 +       struct tpm_virtual_device *tpmvd;
47472 +
47473 +       /*
47474 +        * The following is a private structure of the underlying
47475 +        * driver. It is passed as parameter in the send function.
47476 +        */
47477 +       struct tpm_private *tpm_private;
47478 +};
47479 +
47480 +
47481 +enum vdev_status {
47482 +       TPM_VD_STATUS_DISCONNECTED = 0x0,
47483 +       TPM_VD_STATUS_CONNECTED = 0x1
47484 +};
47485 +
47486 +/* this function is called from tpm_vtpm.c */
47487 +int vtpm_vd_send(struct tpm_private * tp,
47488 +                 const u8 * buf, size_t count, void *ptr);
47489 +
47490 +/* these functions are offered by tpm_vtpm.c */
47491 +struct tpm_chip *init_vtpm(struct device *,
47492 +                           struct tpm_virtual_device *,
47493 +                           struct tpm_private *);
47494 +void cleanup_vtpm(struct device *);
47495 +int vtpm_vd_recv(const struct tpm_chip* chip,
47496 +                 const unsigned char *buffer, size_t count, void *ptr);
47497 +void vtpm_vd_status(const struct tpm_chip *, u8 status);
47498 +
47499 +static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
47500 +{
47501 +       struct tpm_chip *chip = dev_get_drvdata(dev);
47502 +       struct vtpm_state *vtpms = chip_get_private(chip);
47503 +       return vtpms->tpm_private;
47504 +}
47505 +
47506 +#endif
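Editor's note: tpm_vtpm.h above is the whole contract for a lower-layer driver: fill in tpm_virtual_device.max_tx_size so the generic layer sizes its buffers, register through init_vtpm(), and keep the returned chip for the vtpm_vd_* callbacks. A hedged sketch of a hypothetical front-end probe using it (the max_tx_size value and function names are placeholders):

/* Illustrative only: minimal registration against the vtpm glue above. */
static struct tpm_virtual_device example_tvd = {
	.max_tx_size = PAGE_SIZE,	/* placeholder chunk size */
};

static int example_front_probe(struct device *dev, struct tpm_private *tp)
{
	struct tpm_chip *chip = init_vtpm(dev, &example_tvd, tp);

	if (IS_ERR(chip))
		return PTR_ERR(chip);
	/* From here on, responses are delivered through vtpm_vd_recv(). */
	return 0;
}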
47507 diff -ruNp linux-2.6.19/drivers/char/tpm/tpm_xen.c linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm_xen.c
47508 --- linux-2.6.19/drivers/char/tpm/tpm_xen.c     1970-01-01 00:00:00.000000000 +0000
47509 +++ linux-2.6.19-xen-3.0.4/drivers/char/tpm/tpm_xen.c   2007-02-02 19:10:30.000000000 +0000
47510 @@ -0,0 +1,760 @@
47511 +/*
47512 + * Copyright (c) 2005, IBM Corporation
47513 + *
47514 + * Author: Stefan Berger, stefanb@us.ibm.com
47515 + * Grant table support: Mahadevan Gomathisankaran
47516 + *
47517 + * This code has been derived from drivers/xen/netfront/netfront.c
47518 + *
47519 + * Copyright (c) 2002-2004, K A Fraser
47520 + *
47521 + * This program is free software; you can redistribute it and/or
47522 + * modify it under the terms of the GNU General Public License version 2
47523 + * as published by the Free Software Foundation; or, when distributed
47524 + * separately from the Linux kernel or incorporated into other
47525 + * software packages, subject to the following license:
47526 + *
47527 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47528 + * of this source file (the "Software"), to deal in the Software without
47529 + * restriction, including without limitation the rights to use, copy, modify,
47530 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47531 + * and to permit persons to whom the Software is furnished to do so, subject to
47532 + * the following conditions:
47533 + *
47534 + * The above copyright notice and this permission notice shall be included in
47535 + * all copies or substantial portions of the Software.
47536 + *
47537 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47538 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47539 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47540 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47541 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47542 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47543 + * IN THE SOFTWARE.
47544 + */
47545 +
47546 +#include <linux/errno.h>
47547 +#include <linux/err.h>
47548 +#include <linux/interrupt.h>
47549 +#include <linux/mutex.h>
47550 +#include <asm/uaccess.h>
47551 +#include <xen/evtchn.h>
47552 +#include <xen/interface/grant_table.h>
47553 +#include <xen/interface/io/tpmif.h>
47554 +#include <xen/gnttab.h>
47555 +#include <xen/xenbus.h>
47556 +#include "tpm.h"
47557 +#include "tpm_vtpm.h"
47558 +
47559 +#undef DEBUG
47560 +
47561 +/* local structures */
47562 +struct tpm_private {
47563 +       struct tpm_chip *chip;
47564 +
47565 +       tpmif_tx_interface_t *tx;
47566 +       atomic_t refcnt;
47567 +       unsigned int evtchn;
47568 +       unsigned int irq;
47569 +       u8 is_connected;
47570 +       u8 is_suspended;
47571 +
47572 +       spinlock_t tx_lock;
47573 +
47574 +       struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
47575 +
47576 +       atomic_t tx_busy;
47577 +       void *tx_remember;
47578 +
47579 +       domid_t backend_id;
47580 +       wait_queue_head_t wait_q;
47581 +
47582 +       struct xenbus_device *dev;
47583 +       int ring_ref;
47584 +};
47585 +
47586 +struct tx_buffer {
47587 +       unsigned int size;      // available space in data
47588 +       unsigned int len;       // used space in data
47589 +       unsigned char *data;    // pointer to a page
47590 +};
47591 +
47592 +
47593 +/* locally visible variables */
47594 +static grant_ref_t gref_head;
47595 +static struct tpm_private *my_priv;
47596 +
47597 +/* local function prototypes */
47598 +static irqreturn_t tpmif_int(int irq,
47599 +                             void *tpm_priv,
47600 +                             struct pt_regs *ptregs);
47601 +static void tpmif_rx_action(unsigned long unused);
47602 +static int tpmif_connect(struct xenbus_device *dev,
47603 +                         struct tpm_private *tp,
47604 +                         domid_t domid);
47605 +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
47606 +static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
47607 +static void tpmif_free_tx_buffers(struct tpm_private *tp);
47608 +static void tpmif_set_connected_state(struct tpm_private *tp,
47609 +                                      u8 newstate);
47610 +static int tpm_xmit(struct tpm_private *tp,
47611 +                    const u8 * buf, size_t count, int userbuffer,
47612 +                    void *remember);
47613 +static void destroy_tpmring(struct tpm_private *tp);
47614 +void __exit tpmif_exit(void);
47615 +
47616 +#define DPRINTK(fmt, args...) \
47617 +    pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
47618 +#define IPRINTK(fmt, args...) \
47619 +    printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
47620 +#define WPRINTK(fmt, args...) \
47621 +    printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
47622 +
47623 +#define GRANT_INVALID_REF      0
47624 +
47625 +
47626 +static inline int
47627 +tx_buffer_copy(struct tx_buffer *txb, const u8 * src, int len,
47628 +               int isuserbuffer)
47629 +{
47630 +       int copied = len;
47631 +
47632 +       if (len > txb->size) {
47633 +               copied = txb->size;
47634 +       }
47635 +       if (isuserbuffer) {
47636 +               if (copy_from_user(txb->data, src, copied))
47637 +                       return -EFAULT;
47638 +       } else {
47639 +               memcpy(txb->data, src, copied);
47640 +       }
47641 +       txb->len = len;
47642 +       return copied;
47643 +}
47644 +
47645 +static inline struct tx_buffer *tx_buffer_alloc(void)
47646 +{
47647 +       struct tx_buffer *txb = kzalloc(sizeof (struct tx_buffer),
47648 +                                       GFP_KERNEL);
47649 +
47650 +       if (txb) {
47651 +               txb->len = 0;
47652 +               txb->size = PAGE_SIZE;
47653 +               txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
47654 +               if (txb->data == NULL) {
47655 +                       kfree(txb);
47656 +                       txb = NULL;
47657 +               }
47658 +       }
47659 +       return txb;
47660 +}
47661 +
47662 +
47663 +static inline void tx_buffer_free(struct tx_buffer *txb)
47664 +{
47665 +       if (txb) {
47666 +               free_page((long)txb->data);
47667 +               kfree(txb);
47668 +       }
47669 +}
47670 +
47671 +/**************************************************************
47672 + Utility functions for the tpm_private structure
47673 +**************************************************************/
47674 +static inline void tpm_private_init(struct tpm_private *tp)
47675 +{
47676 +       spin_lock_init(&tp->tx_lock);
47677 +       init_waitqueue_head(&tp->wait_q);
47678 +       atomic_set(&tp->refcnt, 1);
47679 +}
47680 +
47681 +static inline void tpm_private_put(void)
47682 +{
47683 +       if ( atomic_dec_and_test(&my_priv->refcnt)) {
47684 +               tpmif_free_tx_buffers(my_priv);
47685 +               kfree(my_priv);
47686 +               my_priv = NULL;
47687 +       }
47688 +}
47689 +
47690 +static struct tpm_private *tpm_private_get(void)
47691 +{
47692 +       int err;
47693 +       if (!my_priv) {
47694 +               my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
47695 +               if (my_priv) {
47696 +                       tpm_private_init(my_priv);
47697 +                       err = tpmif_allocate_tx_buffers(my_priv);
47698 +                       if (err < 0) {
47699 +                               tpm_private_put();
47700 +                       }
47701 +               }
47702 +       } else {
47703 +               atomic_inc(&my_priv->refcnt);
47704 +       }
47705 +       return my_priv;
47706 +}
47707 +
47708 +/**************************************************************
47709 +
47710 + The interface to let the tpm plugin register its callback
47711 + function and send data to another partition using this module
47712 +
47713 +**************************************************************/
47714 +
47715 +static DEFINE_MUTEX(suspend_lock);
47716 +/*
47717 + * Send data via this module by calling this function
47718 + */
47719 +int vtpm_vd_send(struct tpm_private *tp,
47720 +                 const u8 * buf, size_t count, void *ptr)
47721 +{
47722 +       int sent;
47723 +
47724 +       mutex_lock(&suspend_lock);
47725 +       sent = tpm_xmit(tp, buf, count, 0, ptr);
47726 +       mutex_unlock(&suspend_lock);
47727 +
47728 +       return sent;
47729 +}
47730 +
47731 +/**************************************************************
47732 + XENBUS support code
47733 +**************************************************************/
47734 +
47735 +static int setup_tpmring(struct xenbus_device *dev,
47736 +                         struct tpm_private *tp)
47737 +{
47738 +       tpmif_tx_interface_t *sring;
47739 +       int err;
47740 +
47741 +       tp->ring_ref = GRANT_INVALID_REF;
47742 +
47743 +       sring = (void *)__get_free_page(GFP_KERNEL);
47744 +       if (!sring) {
47745 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
47746 +               return -ENOMEM;
47747 +       }
47748 +       tp->tx = sring;
47749 +
47750 +       err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
47751 +       if (err < 0) {
47752 +               free_page((unsigned long)sring);
47753 +               tp->tx = NULL;
47754 +               xenbus_dev_fatal(dev, err, "allocating grant reference");
47755 +               goto fail;
47756 +       }
47757 +       tp->ring_ref = err;
47758 +
47759 +       err = tpmif_connect(dev, tp, dev->otherend_id);
47760 +       if (err)
47761 +               goto fail;
47762 +
47763 +       return 0;
47764 +fail:
47765 +       destroy_tpmring(tp);
47766 +       return err;
47767 +}
47768 +
47769 +
47770 +static void destroy_tpmring(struct tpm_private *tp)
47771 +{
47772 +       tpmif_set_connected_state(tp, 0);
47773 +
47774 +       if (tp->ring_ref != GRANT_INVALID_REF) {
47775 +               gnttab_end_foreign_access(tp->ring_ref, 0,
47776 +                                         (unsigned long)tp->tx);
47777 +               tp->ring_ref = GRANT_INVALID_REF;
47778 +               tp->tx = NULL;
47779 +       }
47780 +
47781 +       if (tp->irq)
47782 +               unbind_from_irqhandler(tp->irq, tp);
47783 +
47784 +       tp->evtchn = tp->irq = 0;
47785 +}
47786 +
47787 +
47788 +static int talk_to_backend(struct xenbus_device *dev,
47789 +                           struct tpm_private *tp)
47790 +{
47791 +       const char *message = NULL;
47792 +       int err;
47793 +       struct xenbus_transaction xbt;
47794 +
47795 +       err = setup_tpmring(dev, tp);
47796 +       if (err) {
47797 +               xenbus_dev_fatal(dev, err, "setting up ring");
47798 +               goto out;
47799 +       }
47800 +
47801 +again:
47802 +       err = xenbus_transaction_start(&xbt);
47803 +       if (err) {
47804 +               xenbus_dev_fatal(dev, err, "starting transaction");
47805 +               goto destroy_tpmring;
47806 +       }
47807 +
47808 +       err = xenbus_printf(xbt, dev->nodename,
47809 +                           "ring-ref","%u", tp->ring_ref);
47810 +       if (err) {
47811 +               message = "writing ring-ref";
47812 +               goto abort_transaction;
47813 +       }
47814 +
47815 +       err = xenbus_printf(xbt, dev->nodename,
47816 +                           "event-channel", "%u", tp->evtchn);
47817 +       if (err) {
47818 +               message = "writing event-channel";
47819 +               goto abort_transaction;
47820 +       }
47821 +
47822 +       err = xenbus_transaction_end(xbt, 0);
47823 +       if (err == -EAGAIN)
47824 +               goto again;
47825 +       if (err) {
47826 +               xenbus_dev_fatal(dev, err, "completing transaction");
47827 +               goto destroy_tpmring;
47828 +       }
47829 +
47830 +       xenbus_switch_state(dev, XenbusStateConnected);
47831 +
47832 +       return 0;
47833 +
47834 +abort_transaction:
47835 +       xenbus_transaction_end(xbt, 1);
47836 +       if (message)
47837 +               xenbus_dev_error(dev, err, "%s", message);
47838 +destroy_tpmring:
47839 +       destroy_tpmring(tp);
47840 +out:
47841 +       return err;
47842 +}
47843 +
47844 +/**
47845 + * Callback received when the backend's state changes.
47846 + */
47847 +static void backend_changed(struct xenbus_device *dev,
47848 +                           enum xenbus_state backend_state)
47849 +{
47850 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
47851 +       DPRINTK("\n");
47852 +
47853 +       switch (backend_state) {
47854 +       case XenbusStateInitialising:
47855 +       case XenbusStateInitWait:
47856 +       case XenbusStateInitialised:
47857 +       case XenbusStateUnknown:
47858 +               break;
47859 +
47860 +       case XenbusStateConnected:
47861 +               tpmif_set_connected_state(tp, 1);
47862 +               break;
47863 +
47864 +       case XenbusStateClosing:
47865 +               tpmif_set_connected_state(tp, 0);
47866 +               xenbus_frontend_closed(dev);
47867 +               break;
47868 +
47869 +       case XenbusStateClosed:
47870 +               tpmif_set_connected_state(tp, 0);
47871 +               if (tp->is_suspended == 0)
47872 +                       device_unregister(&dev->dev);
47873 +               xenbus_frontend_closed(dev);
47874 +               break;
47875 +       }
47876 +}
47877 +
47878 +struct tpm_virtual_device tvd = {
47879 +       .max_tx_size = PAGE_SIZE * TPMIF_TX_RING_SIZE,
47880 +};
47881 +
47882 +static int tpmfront_probe(struct xenbus_device *dev,
47883 +                          const struct xenbus_device_id *id)
47884 +{
47885 +       int err;
47886 +       int handle;
47887 +       struct tpm_private *tp = tpm_private_get();
47888 +
47889 +       if (!tp)
47890 +               return -ENOMEM;
47891 +
47892 +       tp->chip = init_vtpm(&dev->dev, &tvd, tp);
47893 +
47894 +       if (IS_ERR(tp->chip)) {
47895 +               return PTR_ERR(tp->chip);
47896 +       }
47897 +
47898 +       err = xenbus_scanf(XBT_NIL, dev->nodename,
47899 +                          "handle", "%i", &handle);
47900 +       if (XENBUS_EXIST_ERR(err))
47901 +               return err;
47902 +
47903 +       if (err < 0) {
47904 +               xenbus_dev_fatal(dev, err, "reading handle");
47905 +               return err;
47906 +       }
47907 +
47908 +       tp->dev = dev;
47909 +
47910 +       err = talk_to_backend(dev, tp);
47911 +       if (err) {
47912 +               tpm_private_put();
47913 +               return err;
47914 +       }
47915 +       return 0;
47916 +}
47917 +
47918 +
47919 +static int tpmfront_remove(struct xenbus_device *dev)
47920 +{
47921 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
47922 +       destroy_tpmring(tp);
47923 +       cleanup_vtpm(&dev->dev);
47924 +       return 0;
47925 +}
47926 +
47927 +static int tpmfront_suspend(struct xenbus_device *dev)
47928 +{
47929 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
47930 +       u32 ctr;
47931 +       /* lock, so no app can send */
47932 +       mutex_lock(&suspend_lock);
47933 +       tp->is_suspended = 1;
47934 +
47935 +       for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 300; ctr++) {
47936 +               if ((ctr % 10) == 0)
47937 +                       printk("TPM-FE [INFO]: Waiting for outstanding "
47938 +                              "request.\n");
47939 +               /*
47940 +                * Wait for a response to the outstanding request.
47941 +                */
47942 +               interruptible_sleep_on_timeout(&tp->wait_q, 100);
47943 +       }
47944 +       xenbus_switch_state(dev, XenbusStateClosing);
47945 +
47946 +       if (atomic_read(&tp->tx_busy)) {
47947 +               /*
47948 +                * A temporary work-around.
47949 +                */
47950 +               printk("TPM-FE [WARNING]: Resetting busy flag.\n");
47951 +               atomic_set(&tp->tx_busy, 0);
47952 +       }
47953 +
47954 +       return 0;
47955 +}
47956 +
47957 +static int tpmfront_resume(struct xenbus_device *dev)
47958 +{
47959 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
47960 +       destroy_tpmring(tp);
47961 +       return talk_to_backend(dev, tp);
47962 +}
47963 +
47964 +static int tpmif_connect(struct xenbus_device *dev,
47965 +                         struct tpm_private *tp,
47966 +                         domid_t domid)
47967 +{
47968 +       int err;
47969 +
47970 +       tp->backend_id = domid;
47971 +
47972 +       err = xenbus_alloc_evtchn(dev, &tp->evtchn);
47973 +       if (err)
47974 +               return err;
47975 +
47976 +       err = bind_evtchn_to_irqhandler(tp->evtchn,
47977 +                                       tpmif_int, SA_SAMPLE_RANDOM, "tpmif",
47978 +                                       tp);
47979 +       if (err <= 0) {
47980 +               WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
47981 +               return err;
47982 +       }
47983 +
47984 +       tp->irq = err;
47985 +       return 0;
47986 +}
47987 +
47988 +static struct xenbus_device_id tpmfront_ids[] = {
47989 +       { "vtpm" },
47990 +       { "" }
47991 +};
47992 +
47993 +static struct xenbus_driver tpmfront = {
47994 +       .name = "vtpm",
47995 +       .owner = THIS_MODULE,
47996 +       .ids = tpmfront_ids,
47997 +       .probe = tpmfront_probe,
47998 +       .remove =  tpmfront_remove,
47999 +       .resume = tpmfront_resume,
48000 +       .otherend_changed = backend_changed,
48001 +       .suspend = tpmfront_suspend,
48002 +};
48003 +
48004 +static void __init init_tpm_xenbus(void)
48005 +{
48006 +       xenbus_register_frontend(&tpmfront);
48007 +}
48008 +
48009 +static void __exit exit_tpm_xenbus(void)
48010 +{
48011 +       xenbus_unregister_driver(&tpmfront);
48012 +}
48013 +
48014 +static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
48015 +{
48016 +       unsigned int i;
48017 +
48018 +       for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
48019 +               tp->tx_buffers[i] = tx_buffer_alloc();
48020 +               if (!tp->tx_buffers[i]) {
48021 +                       tpmif_free_tx_buffers(tp);
48022 +                       return -ENOMEM;
48023 +               }
48024 +       }
48025 +       return 0;
48026 +}
48027 +
48028 +static void tpmif_free_tx_buffers(struct tpm_private *tp)
48029 +{
48030 +       unsigned int i;
48031 +
48032 +       for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
48033 +               tx_buffer_free(tp->tx_buffers[i]);
48034 +       }
48035 +}
48036 +
48037 +static void tpmif_rx_action(unsigned long priv)
48038 +{
48039 +       struct tpm_private *tp = (struct tpm_private *)priv;
48040 +
48041 +       int i = 0;
48042 +       unsigned int received;
48043 +       unsigned int offset = 0;
48044 +       u8 *buffer;
48045 +       tpmif_tx_request_t *tx;
48046 +       tx = &tp->tx->ring[i].req;
48047 +
48048 +       atomic_set(&tp->tx_busy, 0);
48049 +       wake_up_interruptible(&tp->wait_q);
48050 +
48051 +       received = tx->size;
48052 +
48053 +       buffer = kmalloc(received, GFP_ATOMIC);
48054 +       if (NULL == buffer) {
48055 +               goto exit;
48056 +       }
48057 +
48058 +       for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
48059 +               struct tx_buffer *txb = tp->tx_buffers[i];
48060 +               tpmif_tx_request_t *tx;
48061 +               unsigned int tocopy;
48062 +
48063 +               tx = &tp->tx->ring[i].req;
48064 +               tocopy = tx->size;
48065 +               if (tocopy > PAGE_SIZE) {
48066 +                       tocopy = PAGE_SIZE;
48067 +               }
48068 +
48069 +               memcpy(&buffer[offset], txb->data, tocopy);
48070 +
48071 +               gnttab_release_grant_reference(&gref_head, tx->ref);
48072 +
48073 +               offset += tocopy;
48074 +       }
48075 +
48076 +       vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember);
48077 +       kfree(buffer);
48078 +
48079 +exit:
48080 +
48081 +       return;
48082 +}
48083 +
48084 +
48085 +static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
48086 +{
48087 +       struct tpm_private *tp = tpm_priv;
48088 +       unsigned long flags;
48089 +
48090 +       spin_lock_irqsave(&tp->tx_lock, flags);
48091 +       tpmif_rx_tasklet.data = (unsigned long)tp;
48092 +       tasklet_schedule(&tpmif_rx_tasklet);
48093 +       spin_unlock_irqrestore(&tp->tx_lock, flags);
48094 +
48095 +       return IRQ_HANDLED;
48096 +}
48097 +
48098 +
48099 +static int tpm_xmit(struct tpm_private *tp,
48100 +                    const u8 * buf, size_t count, int isuserbuffer,
48101 +                    void *remember)
48102 +{
48103 +       tpmif_tx_request_t *tx;
48104 +       TPMIF_RING_IDX i;
48105 +       unsigned int offset = 0;
48106 +
48107 +       spin_lock_irq(&tp->tx_lock);
48108 +
48109 +       if (unlikely(atomic_read(&tp->tx_busy))) {
48110 +               printk("tpm_xmit: There's an outstanding request/response "
48111 +                      "on the way!\n");
48112 +               spin_unlock_irq(&tp->tx_lock);
48113 +               return -EBUSY;
48114 +       }
48115 +
48116 +       if (tp->is_connected != 1) {
48117 +               spin_unlock_irq(&tp->tx_lock);
48118 +               return -EIO;
48119 +       }
48120 +
48121 +       for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
48122 +               struct tx_buffer *txb = tp->tx_buffers[i];
48123 +               int copied;
48124 +
48125 +               if (NULL == txb) {
48126 +                       DPRINTK("txb (i=%d) is NULL. buffers initialized?\n"
48127 +                               "Not transmitting anything!\n", i);
48128 +                       spin_unlock_irq(&tp->tx_lock);
48129 +                       return -EFAULT;
48130 +               }
48131 +               copied = tx_buffer_copy(txb, &buf[offset], count,
48132 +                                       isuserbuffer);
48133 +               if (copied < 0) {
48134 +                       /* An error occurred */
48135 +                       spin_unlock_irq(&tp->tx_lock);
48136 +                       return copied;
48137 +               }
48138 +               count -= copied;
48139 +               offset += copied;
48140 +
48141 +               tx = &tp->tx->ring[i].req;
48142 +
48143 +               tx->addr = virt_to_machine(txb->data);
48144 +               tx->size = txb->len;
48145 +
48146 +               DPRINTK("First 4 characters sent by TPM-FE are 0x%02x 0x%02x 0x%02x 0x%02x\n",
48147 +                       txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
48148 +
48149 +               /* get the grant table reference for this page */
48150 +               tx->ref = gnttab_claim_grant_reference(&gref_head);
48151 +
48152 +               if (-ENOSPC == tx->ref) {
48153 +                       spin_unlock_irq(&tp->tx_lock);
48154 +                       DPRINTK(" Grant table claim reference failed in func:%s line:%d file:%s\n", __FUNCTION__, __LINE__, __FILE__);
48155 +                       return -ENOSPC;
48156 +               }
48157 +               gnttab_grant_foreign_access_ref( tx->ref,
48158 +                                                tp->backend_id,
48159 +                                                (tx->addr >> PAGE_SHIFT),
48160 +                                                0 /*RW*/);
48161 +               wmb();
48162 +       }
48163 +
48164 +       atomic_set(&tp->tx_busy, 1);
48165 +       tp->tx_remember = remember;
48166 +
48167 +       mb();
48168 +
48169 +       DPRINTK("Notifying backend via event channel %d\n",
48170 +               tp->evtchn);
48171 +
48172 +       notify_remote_via_irq(tp->irq);
48173 +
48174 +       spin_unlock_irq(&tp->tx_lock);
48175 +       return offset;
48176 +}
48177 +
48178 +
48179 +static void tpmif_notify_upperlayer(struct tpm_private *tp)
48180 +{
48181 +       /*
48182 +        * Notify upper layer about the state of the connection
48183 +        * to the BE.
48184 +        */
48185 +       if (tp->is_connected) {
48186 +               vtpm_vd_status(tp->chip, TPM_VD_STATUS_CONNECTED);
48187 +       } else {
48188 +               vtpm_vd_status(tp->chip, TPM_VD_STATUS_DISCONNECTED);
48189 +       }
48190 +}
48191 +
48192 +
48193 +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
48194 +{
48195 +       /*
48196 +        * Don't notify upper layer if we are in suspend mode and
48197 +        * should disconnect - assumption is that we will resume.
48198 +        * The mutex keeps apps from sending.
48199 +        */
48200 +       if (is_connected == 0 && tp->is_suspended == 1) {
48201 +               return;
48202 +       }
48203 +
48204 +       /*
48205 +        * Unlock the mutex if we are connected again
48206 +        * after being suspended - now resuming.
48207 +        * This also removes the suspend state.
48208 +        */
48209 +       if (is_connected == 1 && tp->is_suspended == 1) {
48210 +               tp->is_suspended = 0;
48211 +               /* unlock, so apps can resume sending */
48212 +               mutex_unlock(&suspend_lock);
48213 +       }
48214 +
48215 +       if (is_connected != tp->is_connected) {
48216 +               tp->is_connected = is_connected;
48217 +               tpmif_notify_upperlayer(tp);
48218 +       }
48219 +}
48220 +
48221 +
48222 +
48223 +/* =================================================================
48224 + * Initialization function.
48225 + * =================================================================
48226 + */
48227 +
48228 +
48229 +static int __init tpmif_init(void)
48230 +{
48231 +       long rc = 0;
48232 +       struct tpm_private *tp;
48233 +
48234 +       if (is_initial_xendomain())
48235 +               return -EPERM;
48236 +
48237 +       tp = tpm_private_get();
48238 +       if (!tp) {
48239 +               rc = -ENOMEM;
48240 +               goto failexit;
48241 +       }
48242 +
48243 +       IPRINTK("Initialising the vTPM driver.\n");
48244 +       if ( gnttab_alloc_grant_references ( TPMIF_TX_RING_SIZE,
48245 +                                            &gref_head ) < 0) {
48246 +               rc = -EFAULT;
48247 +               goto gnttab_alloc_failed;
48248 +       }
48249 +
48250 +       init_tpm_xenbus();
48251 +       return 0;
48252 +
48253 +gnttab_alloc_failed:
48254 +       tpm_private_put();
48255 +failexit:
48256 +
48257 +       return (int)rc;
48258 +}
48259 +
48260 +
48261 +void __exit tpmif_exit(void)
48262 +{
48263 +       exit_tpm_xenbus();
48264 +       tpm_private_put();
48265 +       gnttab_free_grant_references(gref_head);
48266 +}
48267 +
48268 +module_init(tpmif_init);
48269 +
48270 +MODULE_LICENSE("Dual BSD/GPL");
48271 diff -ruNp linux-2.6.19/drivers/char/tty_io.c linux-2.6.19-xen-3.0.4/drivers/char/tty_io.c
48272 --- linux-2.6.19/drivers/char/tty_io.c  2006-11-29 21:57:37.000000000 +0000
48273 +++ linux-2.6.19-xen-3.0.4/drivers/char/tty_io.c        2007-02-02 19:10:30.000000000 +0000
48274 @@ -131,6 +131,8 @@ LIST_HEAD(tty_drivers);                     /* linked list
48275  DEFINE_MUTEX(tty_mutex);
48276  EXPORT_SYMBOL(tty_mutex);
48277  
48278 +int console_use_vt = 1;
48279 +
48280  #ifdef CONFIG_UNIX98_PTYS
48281  extern struct tty_driver *ptm_driver;  /* Unix98 pty masters; for /dev/ptmx */
48282  extern int pty_limit;          /* Config limit on Unix98 ptys */
48283 @@ -2477,7 +2479,7 @@ retry_open:
48284                 goto got_driver;
48285         }
48286  #ifdef CONFIG_VT
48287 -       if (device == MKDEV(TTY_MAJOR,0)) {
48288 +       if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
48289                 extern struct tty_driver *console_driver;
48290                 driver = console_driver;
48291                 index = fg_console;
48292 @@ -3912,6 +3914,8 @@ static int __init tty_init(void)
48293  #endif
48294  
48295  #ifdef CONFIG_VT
48296 +       if (!console_use_vt)
48297 +               goto out_vt;
48298         cdev_init(&vc0_cdev, &console_fops);
48299         if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
48300             register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
48301 @@ -3919,6 +3923,7 @@ static int __init tty_init(void)
48302         class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
48303  
48304         vty_init();
48305 + out_vt:
48306  #endif
48307         return 0;
48308  }
48309 diff -ruNp linux-2.6.19/drivers/firmware/Kconfig linux-2.6.19-xen-3.0.4/drivers/firmware/Kconfig
48310 --- linux-2.6.19/drivers/firmware/Kconfig       2006-11-29 21:57:37.000000000 +0000
48311 +++ linux-2.6.19-xen-3.0.4/drivers/firmware/Kconfig     2007-02-02 19:10:30.000000000 +0000
48312 @@ -7,7 +7,7 @@ menu "Firmware Drivers"
48313  
48314  config EDD
48315         tristate "BIOS Enhanced Disk Drive calls determine boot disk"
48316 -       depends on !IA64
48317 +       depends on !IA64 && !XEN
48318         help
48319           Say Y or M here if you want to enable BIOS Enhanced Disk Drive
48320           Services real mode BIOS calls to determine which disk
48321 diff -ruNp linux-2.6.19/drivers/ide/ide-lib.c linux-2.6.19-xen-3.0.4/drivers/ide/ide-lib.c
48322 --- linux-2.6.19/drivers/ide/ide-lib.c  2006-11-29 21:57:37.000000000 +0000
48323 +++ linux-2.6.19-xen-3.0.4/drivers/ide/ide-lib.c        2007-02-02 19:10:31.000000000 +0000
48324 @@ -429,10 +429,10 @@ void ide_toggle_bounce(ide_drive_t *driv
48325  {
48326         u64 addr = BLK_BOUNCE_HIGH;     /* dma64_addr_t */
48327  
48328 -       if (!PCI_DMA_BUS_IS_PHYS) {
48329 -               addr = BLK_BOUNCE_ANY;
48330 -       } else if (on && drive->media == ide_disk) {
48331 -               if (HWIF(drive)->pci_dev)
48332 +       if (on && drive->media == ide_disk) {
48333 +               if (!PCI_DMA_BUS_IS_PHYS)
48334 +                       addr = BLK_BOUNCE_ANY;
48335 +               else if (HWIF(drive)->pci_dev)
48336                         addr = HWIF(drive)->pci_dev->dma_mask;
48337         }
48338  
48339 diff -ruNp linux-2.6.19/drivers/oprofile/buffer_sync.c linux-2.6.19-xen-3.0.4/drivers/oprofile/buffer_sync.c
48340 --- linux-2.6.19/drivers/oprofile/buffer_sync.c 2006-11-29 21:57:37.000000000 +0000
48341 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/buffer_sync.c       2007-02-02 19:10:36.000000000 +0000
48342 @@ -6,6 +6,10 @@
48343   *
48344   * @author John Levon <levon@movementarian.org>
48345   *
48346 + * Modified by Aravind Menon for Xen
48347 + * These modifications are:
48348 + * Copyright (C) 2005 Hewlett-Packard Co.
48349 + *
48350   * This is the core of the buffer management. Each
48351   * CPU buffer is processed and entered into the
48352   * global event buffer. Such processing is necessary
48353 @@ -275,15 +279,31 @@ static void add_cpu_switch(int i)
48354         last_cookie = INVALID_COOKIE;
48355  }
48356  
48357 -static void add_kernel_ctx_switch(unsigned int in_kernel)
48358 +static void add_cpu_mode_switch(unsigned int cpu_mode)
48359  {
48360         add_event_entry(ESCAPE_CODE);
48361 -       if (in_kernel)
48362 -               add_event_entry(KERNEL_ENTER_SWITCH_CODE); 
48363 -       else
48364 -               add_event_entry(KERNEL_EXIT_SWITCH_CODE); 
48365 +       switch (cpu_mode) {
48366 +       case CPU_MODE_USER:
48367 +               add_event_entry(USER_ENTER_SWITCH_CODE);
48368 +               break;
48369 +       case CPU_MODE_KERNEL:
48370 +               add_event_entry(KERNEL_ENTER_SWITCH_CODE);
48371 +               break;
48372 +       case CPU_MODE_XEN:
48373 +               add_event_entry(XEN_ENTER_SWITCH_CODE);
48374 +               break;
48375 +       default:
48376 +               break;
48377 +       }
48378  }
48379
48380 +
48381 +static void add_domain_switch(unsigned long domain_id)
48382 +{
48383 +       add_event_entry(ESCAPE_CODE);
48384 +       add_event_entry(DOMAIN_SWITCH_CODE);
48385 +       add_event_entry(domain_id);
48386 +}
48387 +
48388  static void
48389  add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
48390  {
48391 @@ -348,9 +368,9 @@ static int add_us_sample(struct mm_struc
48392   * for later lookup from userspace.
48393   */
48394  static int
48395 -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
48396 +add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode)
48397  {
48398 -       if (in_kernel) {
48399 +       if (cpu_mode >= CPU_MODE_KERNEL) {
48400                 add_sample_entry(s->eip, s->event);
48401                 return 1;
48402         } else if (mm) {
48403 @@ -496,10 +516,11 @@ void sync_buffer(int cpu)
48404         struct mm_struct *mm = NULL;
48405         struct task_struct * new;
48406         unsigned long cookie = 0;
48407 -       int in_kernel = 1;
48408 +       int cpu_mode = 1;
48409         unsigned int i;
48410         sync_buffer_state state = sb_buffer_start;
48411         unsigned long available;
48412 +       int domain_switch = 0;
48413  
48414         mutex_lock(&buffer_mutex);
48415   
48416 @@ -512,16 +533,18 @@ void sync_buffer(int cpu)
48417         for (i = 0; i < available; ++i) {
48418                 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
48419   
48420 -               if (is_code(s->eip)) {
48421 -                       if (s->event <= CPU_IS_KERNEL) {
48422 -                               /* kernel/userspace switch */
48423 -                               in_kernel = s->event;
48424 +               if (is_code(s->eip) && !domain_switch) {
48425 +                       if (s->event <= CPU_MODE_XEN) {
48426 +                               /* xen/kernel/userspace switch */
48427 +                               cpu_mode = s->event;
48428                                 if (state == sb_buffer_start)
48429                                         state = sb_sample_start;
48430 -                               add_kernel_ctx_switch(s->event);
48431 +                               add_cpu_mode_switch(s->event);
48432                         } else if (s->event == CPU_TRACE_BEGIN) {
48433                                 state = sb_bt_start;
48434                                 add_trace_begin();
48435 +                       } else if (s->event == CPU_DOMAIN_SWITCH) {
48436 +                                       domain_switch = 1;                              
48437                         } else {
48438                                 struct mm_struct * oldmm = mm;
48439  
48440 @@ -535,11 +558,16 @@ void sync_buffer(int cpu)
48441                                 add_user_ctx_switch(new, cookie);
48442                         }
48443                 } else {
48444 -                       if (state >= sb_bt_start &&
48445 -                           !add_sample(mm, s, in_kernel)) {
48446 -                               if (state == sb_bt_start) {
48447 -                                       state = sb_bt_ignore;
48448 -                                       atomic_inc(&oprofile_stats.bt_lost_no_mapping);
48449 +                       if (domain_switch) {
48450 +                               add_domain_switch(s->eip);
48451 +                               domain_switch = 0;
48452 +                       } else {
48453 +                               if (state >= sb_bt_start &&
48454 +                                   !add_sample(mm, s, cpu_mode)) {
48455 +                                       if (state == sb_bt_start) {
48456 +                                               state = sb_bt_ignore;
48457 +                                               atomic_inc(&oprofile_stats.bt_lost_no_mapping);
48458 +                                       }
48459                                 }
48460                         }
48461                 }
48462 diff -ruNp linux-2.6.19/drivers/oprofile/cpu_buffer.c linux-2.6.19-xen-3.0.4/drivers/oprofile/cpu_buffer.c
48463 --- linux-2.6.19/drivers/oprofile/cpu_buffer.c  2006-11-29 21:57:37.000000000 +0000
48464 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/cpu_buffer.c        2007-02-02 19:10:36.000000000 +0000
48465 @@ -6,6 +6,10 @@
48466   *
48467   * @author John Levon <levon@movementarian.org>
48468   *
48469 + * Modified by Aravind Menon for Xen
48470 + * These modifications are:
48471 + * Copyright (C) 2005 Hewlett-Packard Co.
48472 + *
48473   * Each CPU has a local buffer that stores PC value/event
48474   * pairs. We also log context switches when we notice them.
48475   * Eventually each CPU's buffer is processed into the global
48476 @@ -34,6 +38,8 @@ static void wq_sync_buffer(void *);
48477  #define DEFAULT_TIMER_EXPIRE (HZ / 10)
48478  static int work_enabled;
48479  
48480 +static int32_t current_domain = COORDINATOR_DOMAIN;
48481 +
48482  void free_cpu_buffers(void)
48483  {
48484         int i;
48485 @@ -57,7 +63,7 @@ int alloc_cpu_buffers(void)
48486                         goto fail;
48487   
48488                 b->last_task = NULL;
48489 -               b->last_is_kernel = -1;
48490 +               b->last_cpu_mode = -1;
48491                 b->tracing = 0;
48492                 b->buffer_size = buffer_size;
48493                 b->tail_pos = 0;
48494 @@ -113,7 +119,7 @@ void cpu_buffer_reset(struct oprofile_cp
48495          * collected will populate the buffer with proper
48496          * values to initialize the buffer
48497          */
48498 -       cpu_buf->last_is_kernel = -1;
48499 +       cpu_buf->last_cpu_mode = -1;
48500         cpu_buf->last_task = NULL;
48501  }
48502  
48503 @@ -163,13 +169,13 @@ add_code(struct oprofile_cpu_buffer * bu
48504   * because of the head/tail separation of the writer and reader
48505   * of the CPU buffer.
48506   *
48507 - * is_kernel is needed because on some architectures you cannot
48508 + * cpu_mode is needed because on some architectures you cannot
48509   * tell if you are in kernel or user space simply by looking at
48510 - * pc. We tag this in the buffer by generating kernel enter/exit
48511 - * events whenever is_kernel changes
48512 + * pc. We tag this in the buffer by generating kernel/user (and xen)
48513 + *  enter events whenever cpu_mode changes
48514   */
48515  static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
48516 -                     int is_kernel, unsigned long event)
48517 +                     int cpu_mode, unsigned long event)
48518  {
48519         struct task_struct * task;
48520  
48521 @@ -180,18 +186,20 @@ static int log_sample(struct oprofile_cp
48522                 return 0;
48523         }
48524  
48525 -       is_kernel = !!is_kernel;
48526 +       WARN_ON(cpu_mode > CPU_MODE_XEN);
48527  
48528         task = current;
48529  
48530         /* notice a switch from user->kernel or vice versa */
48531 -       if (cpu_buf->last_is_kernel != is_kernel) {
48532 -               cpu_buf->last_is_kernel = is_kernel;
48533 -               add_code(cpu_buf, is_kernel);
48534 +       if (cpu_buf->last_cpu_mode != cpu_mode) {
48535 +               cpu_buf->last_cpu_mode = cpu_mode;
48536 +               add_code(cpu_buf, cpu_mode);
48537         }
48538 -
48539 +       
48540         /* notice a task switch */
48541 -       if (cpu_buf->last_task != task) {
48542 +       /* if not processing other domain samples */
48543 +       if ((cpu_buf->last_task != task) &&
48544 +           (current_domain == COORDINATOR_DOMAIN)) {
48545                 cpu_buf->last_task = task;
48546                 add_code(cpu_buf, (unsigned long)task);
48547         }
48548 @@ -275,6 +283,25 @@ void oprofile_add_trace(unsigned long pc
48549         add_sample(cpu_buf, pc, 0);
48550  }
48551  
48552 +int oprofile_add_domain_switch(int32_t domain_id)
48553 +{
48554 +       struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
48555 +
48556 +       /* should have space for switching into and out of domain 
48557 +          (2 slots each) plus one sample and one cpu mode switch */
48558 +       if (((nr_available_slots(cpu_buf) < 6) && 
48559 +            (domain_id != COORDINATOR_DOMAIN)) ||
48560 +           (nr_available_slots(cpu_buf) < 2))
48561 +               return 0;
48562 +
48563 +       add_code(cpu_buf, CPU_DOMAIN_SWITCH);
48564 +       add_sample(cpu_buf, domain_id, 0);
48565 +
48566 +       current_domain = domain_id;
48567 +
48568 +       return 1;
48569 +}
48570 +
48571  /*
48572   * This serves to avoid cpu buffer overflow, and makes sure
48573   * the task mortuary progresses
48574 diff -ruNp linux-2.6.19/drivers/oprofile/cpu_buffer.h linux-2.6.19-xen-3.0.4/drivers/oprofile/cpu_buffer.h
48575 --- linux-2.6.19/drivers/oprofile/cpu_buffer.h  2006-11-29 21:57:37.000000000 +0000
48576 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/cpu_buffer.h        2007-02-02 19:10:36.000000000 +0000
48577 @@ -36,7 +36,7 @@ struct oprofile_cpu_buffer {
48578         volatile unsigned long tail_pos;
48579         unsigned long buffer_size;
48580         struct task_struct * last_task;
48581 -       int last_is_kernel;
48582 +       int last_cpu_mode;
48583         int tracing;
48584         struct op_sample * buffer;
48585         unsigned long sample_received;
48586 @@ -51,7 +51,10 @@ extern struct oprofile_cpu_buffer cpu_bu
48587  void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf);
48588  
48589  /* transient events for the CPU buffer -> event buffer */
48590 -#define CPU_IS_KERNEL 1
48591 -#define CPU_TRACE_BEGIN 2
48592 +#define CPU_MODE_USER           0
48593 +#define CPU_MODE_KERNEL         1
48594 +#define CPU_MODE_XEN            2
48595 +#define CPU_TRACE_BEGIN         3
48596 +#define CPU_DOMAIN_SWITCH       4
48597  
48598  #endif /* OPROFILE_CPU_BUFFER_H */
48599 diff -ruNp linux-2.6.19/drivers/oprofile/event_buffer.h linux-2.6.19-xen-3.0.4/drivers/oprofile/event_buffer.h
48600 --- linux-2.6.19/drivers/oprofile/event_buffer.h        2006-11-29 21:57:37.000000000 +0000
48601 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/event_buffer.h      2007-02-02 19:10:36.000000000 +0000
48602 @@ -29,15 +29,20 @@ void wake_up_buffer_waiter(void);
48603  #define CPU_SWITCH_CODE                2
48604  #define COOKIE_SWITCH_CODE             3
48605  #define KERNEL_ENTER_SWITCH_CODE       4
48606 -#define KERNEL_EXIT_SWITCH_CODE                5
48607 +#define USER_ENTER_SWITCH_CODE         5
48608  #define MODULE_LOADED_CODE             6
48609  #define CTX_TGID_CODE                  7
48610  #define TRACE_BEGIN_CODE               8
48611  #define TRACE_END_CODE                 9
48612 +#define XEN_ENTER_SWITCH_CODE          10
48613 +#define DOMAIN_SWITCH_CODE             11
48614   
48615  #define INVALID_COOKIE ~0UL
48616  #define NO_COOKIE 0UL
48617  
48618 +/* Constant used to refer to coordinator domain (Xen) */
48619 +#define COORDINATOR_DOMAIN -1
48620 +
48621  /* add data to the event buffer */
48622  void add_event_entry(unsigned long data);
48623   
48624 diff -ruNp linux-2.6.19/drivers/oprofile/oprof.c linux-2.6.19-xen-3.0.4/drivers/oprofile/oprof.c
48625 --- linux-2.6.19/drivers/oprofile/oprof.c       2006-11-29 21:57:37.000000000 +0000
48626 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/oprof.c     2007-02-02 19:10:36.000000000 +0000
48627 @@ -5,6 +5,10 @@
48628   * @remark Read the file COPYING
48629   *
48630   * @author John Levon <levon@movementarian.org>
48631 + *
48632 + * Modified by Aravind Menon for Xen
48633 + * These modifications are:
48634 + * Copyright (C) 2005 Hewlett-Packard Co.
48635   */
48636  
48637  #include <linux/kernel.h>
48638 @@ -19,7 +23,7 @@
48639  #include "cpu_buffer.h"
48640  #include "buffer_sync.h"
48641  #include "oprofile_stats.h"
48642
48643 +
48644  struct oprofile_operations oprofile_ops;
48645  
48646  unsigned long oprofile_started;
48647 @@ -33,6 +37,32 @@ static DEFINE_MUTEX(start_mutex);
48648   */
48649  static int timer = 0;
48650  
48651 +int oprofile_set_active(int active_domains[], unsigned int adomains)
48652 +{
48653 +       int err;
48654 +
48655 +       if (!oprofile_ops.set_active)
48656 +               return -EINVAL;
48657 +
48658 +       mutex_lock(&start_mutex);
48659 +       err = oprofile_ops.set_active(active_domains, adomains);
48660 +       mutex_unlock(&start_mutex);
48661 +       return err;
48662 +}
48663 +
48664 +int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
48665 +{
48666 +       int err;
48667 +
48668 +       if (!oprofile_ops.set_passive)
48669 +               return -EINVAL;
48670 +
48671 +       mutex_lock(&start_mutex);
48672 +       err = oprofile_ops.set_passive(passive_domains, pdomains);
48673 +       mutex_unlock(&start_mutex);
48674 +       return err;
48675 +}
48676 +
48677  int oprofile_setup(void)
48678  {
48679         int err;
48680 diff -ruNp linux-2.6.19/drivers/oprofile/oprof.h linux-2.6.19-xen-3.0.4/drivers/oprofile/oprof.h
48681 --- linux-2.6.19/drivers/oprofile/oprof.h       2006-11-29 21:57:37.000000000 +0000
48682 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/oprof.h     2007-02-02 19:10:36.000000000 +0000
48683 @@ -35,5 +35,8 @@ void oprofile_create_files(struct super_
48684  void oprofile_timer_init(struct oprofile_operations * ops);
48685  
48686  int oprofile_set_backtrace(unsigned long depth);
48687 +
48688 +int oprofile_set_active(int active_domains[], unsigned int adomains);
48689 +int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
48690   
48691  #endif /* OPROF_H */
48692 diff -ruNp linux-2.6.19/drivers/oprofile/oprofile_files.c linux-2.6.19-xen-3.0.4/drivers/oprofile/oprofile_files.c
48693 --- linux-2.6.19/drivers/oprofile/oprofile_files.c      2006-11-29 21:57:37.000000000 +0000
48694 +++ linux-2.6.19-xen-3.0.4/drivers/oprofile/oprofile_files.c    2007-02-02 19:10:36.000000000 +0000
48695 @@ -5,15 +5,21 @@
48696   * @remark Read the file COPYING
48697   *
48698   * @author John Levon <levon@movementarian.org>
48699 + *
48700 + * Modified by Aravind Menon for Xen
48701 + * These modifications are:
48702 + * Copyright (C) 2005 Hewlett-Packard Co.      
48703   */
48704  
48705  #include <linux/fs.h>
48706  #include <linux/oprofile.h>
48707 +#include <asm/uaccess.h>
48708 +#include <linux/ctype.h>
48709  
48710  #include "event_buffer.h"
48711  #include "oprofile_stats.h"
48712  #include "oprof.h"
48713
48714 +
48715  unsigned long fs_buffer_size = 131072;
48716  unsigned long fs_cpu_buffer_size = 8192;
48717  unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */
48718 @@ -117,11 +123,208 @@ static ssize_t dump_write(struct file * 
48719  static struct file_operations dump_fops = {
48720         .write          = dump_write,
48721  };
48722
48723 +
48724 +#ifdef CONFIG_XEN
48725 +
48726 +#define TMPBUFSIZE 512
48727 +
48728 +static unsigned int adomains = 0;
48729 +static int active_domains[MAX_OPROF_DOMAINS + 1];
48730 +static DEFINE_MUTEX(adom_mutex);
48731 +
48732 +static ssize_t adomain_write(struct file * file, char const __user * buf, 
48733 +                            size_t count, loff_t * offset)
48734 +{
48735 +       char *tmpbuf;
48736 +       char *startp, *endp;
48737 +       int i;
48738 +       unsigned long val;
48739 +       ssize_t retval = count;
48740 +       
48741 +       if (*offset)
48742 +               return -EINVAL; 
48743 +       if (count > TMPBUFSIZE - 1)
48744 +               return -EINVAL;
48745 +
48746 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
48747 +               return -ENOMEM;
48748 +
48749 +       if (copy_from_user(tmpbuf, buf, count)) {
48750 +               kfree(tmpbuf);
48751 +               return -EFAULT;
48752 +       }
48753 +       tmpbuf[count] = 0;
48754 +
48755 +       mutex_lock(&adom_mutex);
48756 +
48757 +       startp = tmpbuf;
48758 +       /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
48759 +       for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
48760 +               val = simple_strtoul(startp, &endp, 0);
48761 +               if (endp == startp)
48762 +                       break;
48763 +               while (ispunct(*endp) || isspace(*endp))
48764 +                       endp++;
48765 +               active_domains[i] = val;
48766 +               if (active_domains[i] != val)
48767 +                       /* Overflow, force error below */
48768 +                       i = MAX_OPROF_DOMAINS + 1;
48769 +               startp = endp;
48770 +       }
48771 +       /* Force error on trailing junk */
48772 +       adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
48773 +
48774 +       kfree(tmpbuf);
48775 +
48776 +       if (adomains > MAX_OPROF_DOMAINS
48777 +           || oprofile_set_active(active_domains, adomains)) {
48778 +               adomains = 0;
48779 +               retval = -EINVAL;
48780 +       }
48781 +
48782 +       mutex_unlock(&adom_mutex);
48783 +       return retval;
48784 +}
48785 +
48786 +static ssize_t adomain_read(struct file * file, char __user * buf, 
48787 +                           size_t count, loff_t * offset)
48788 +{
48789 +       char * tmpbuf;
48790 +       size_t len;
48791 +       int i;
48792 +       ssize_t retval;
48793 +
48794 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
48795 +               return -ENOMEM;
48796 +
48797 +       mutex_lock(&adom_mutex);
48798 +
48799 +       len = 0;
48800 +       for (i = 0; i < adomains; i++)
48801 +               len += snprintf(tmpbuf + len,
48802 +                               len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
48803 +                               "%u ", active_domains[i]);
48804 +       WARN_ON(len > TMPBUFSIZE);
48805 +       if (len != 0 && len <= TMPBUFSIZE)
48806 +               tmpbuf[len-1] = '\n';
48807 +
48808 +       mutex_unlock(&adom_mutex);
48809 +
48810 +       retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
48811 +
48812 +       kfree(tmpbuf);
48813 +       return retval;
48814 +}
48815 +
48816 +
48817 +static struct file_operations active_domain_ops = {
48818 +       .read           = adomain_read,
48819 +       .write          = adomain_write,
48820 +};
48821 +
48822 +static unsigned int pdomains = 0;
48823 +static int passive_domains[MAX_OPROF_DOMAINS];
48824 +static DEFINE_MUTEX(pdom_mutex);
48825 +
48826 +static ssize_t pdomain_write(struct file * file, char const __user * buf, 
48827 +                            size_t count, loff_t * offset)
48828 +{
48829 +       char *tmpbuf;
48830 +       char *startp, *endp;
48831 +       int i;
48832 +       unsigned long val;
48833 +       ssize_t retval = count;
48834 +       
48835 +       if (*offset)
48836 +               return -EINVAL; 
48837 +       if (count > TMPBUFSIZE - 1)
48838 +               return -EINVAL;
48839 +
48840 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
48841 +               return -ENOMEM;
48842 +
48843 +       if (copy_from_user(tmpbuf, buf, count)) {
48844 +               kfree(tmpbuf);
48845 +               return -EFAULT;
48846 +       }
48847 +       tmpbuf[count] = 0;
48848 +
48849 +       mutex_lock(&pdom_mutex);
48850 +
48851 +       startp = tmpbuf;
48852 +       /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
48853 +       for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
48854 +               val = simple_strtoul(startp, &endp, 0);
48855 +               if (endp == startp)
48856 +                       break;
48857 +               while (ispunct(*endp) || isspace(*endp))
48858 +                       endp++;
48859 +               passive_domains[i] = val;
48860 +               if (passive_domains[i] != val)
48861 +                       /* Overflow, force error below */
48862 +                       i = MAX_OPROF_DOMAINS + 1;
48863 +               startp = endp;
48864 +       }
48865 +       /* Force error on trailing junk */
48866 +       pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
48867 +
48868 +       kfree(tmpbuf);
48869 +
48870 +       if (pdomains > MAX_OPROF_DOMAINS
48871 +           || oprofile_set_passive(passive_domains, pdomains)) {
48872 +               pdomains = 0;
48873 +               retval = -EINVAL;
48874 +       }
48875 +
48876 +       mutex_unlock(&pdom_mutex);
48877 +       return retval;
48878 +}
48879 +
48880 +static ssize_t pdomain_read(struct file * file, char __user * buf, 
48881 +                           size_t count, loff_t * offset)
48882 +{
48883 +       char * tmpbuf;
48884 +       size_t len;
48885 +       int i;
48886 +       ssize_t retval;
48887 +
48888 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
48889 +               return -ENOMEM;
48890 +
48891 +       mutex_lock(&pdom_mutex);
48892 +
48893 +       len = 0;
48894 +       for (i = 0; i < pdomains; i++)
48895 +               len += snprintf(tmpbuf + len,
48896 +                               len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
48897 +                               "%u ", passive_domains[i]);
48898 +       WARN_ON(len > TMPBUFSIZE);
48899 +       if (len != 0 && len <= TMPBUFSIZE)
48900 +               tmpbuf[len-1] = '\n';
48901 +
48902 +       mutex_unlock(&pdom_mutex);
48903 +
48904 +       retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
48905 +
48906 +       kfree(tmpbuf);
48907 +       return retval;
48908 +}
48909 +
48910 +static struct file_operations passive_domain_ops = {
48911 +       .read           = pdomain_read,
48912 +       .write          = pdomain_write,
48913 +};
48914 +
48915 +#endif /* CONFIG_XEN */
48916 +
48917  void oprofile_create_files(struct super_block * sb, struct dentry * root)
48918  {
48919         oprofilefs_create_file(sb, root, "enable", &enable_fops);
48920         oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
48921 +#ifdef CONFIG_XEN
48922 +       oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
48923 +       oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
48924 +#endif
48925         oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
48926         oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size);
48927         oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed);
48928 diff -ruNp linux-2.6.19/drivers/pci/Kconfig linux-2.6.19-xen-3.0.4/drivers/pci/Kconfig
48929 --- linux-2.6.19/drivers/pci/Kconfig    2006-11-29 21:57:37.000000000 +0000
48930 +++ linux-2.6.19-xen-3.0.4/drivers/pci/Kconfig  2007-02-02 19:10:36.000000000 +0000
48931 @@ -5,6 +5,7 @@ config PCI_MSI
48932         bool "Message Signaled Interrupts (MSI and MSI-X)"
48933         depends on PCI
48934         depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
48935 +       depends on !XEN
48936         help
48937            This allows device drivers to enable MSI (Message Signaled
48938            Interrupts).  Message Signaled Interrupts enable a device to
48939 @@ -55,7 +56,7 @@ config PCI_DEBUG
48940  config HT_IRQ
48941         bool "Interrupts on hypertransport devices"
48942         default y
48943 -       depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
48944 +       depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
48945         help
48946            This allows native hypertransport devices to use interrupts.
48947  
48948 diff -ruNp linux-2.6.19/drivers/serial/Kconfig linux-2.6.19-xen-3.0.4/drivers/serial/Kconfig
48949 --- linux-2.6.19/drivers/serial/Kconfig 2006-11-29 21:57:37.000000000 +0000
48950 +++ linux-2.6.19-xen-3.0.4/drivers/serial/Kconfig       2007-02-02 19:10:41.000000000 +0000
48951 @@ -11,6 +11,7 @@ menu "Serial drivers"
48952  config SERIAL_8250
48953         tristate "8250/16550 and compatible serial support"
48954         depends on (BROKEN || !SPARC)
48955 +       depends on !XEN_DISABLE_SERIAL
48956         select SERIAL_CORE
48957         ---help---
48958           This selects whether you want to include the driver for the standard
48959 diff -ruNp linux-2.6.19/drivers/xen/Kconfig linux-2.6.19-xen-3.0.4/drivers/xen/Kconfig
48960 --- linux-2.6.19/drivers/xen/Kconfig    1970-01-01 00:00:00.000000000 +0000
48961 +++ linux-2.6.19-xen-3.0.4/drivers/xen/Kconfig  2007-02-02 19:10:45.000000000 +0000
48962 @@ -0,0 +1,283 @@
48963 +#
48964 +# This Kconfig describes Xen options
48965 +#
48966 +
48967 +mainmenu "Xen Configuration"
48968 +
48969 +config XEN
48970 +       bool
48971 +       default y if X86_XEN || X86_64_XEN
48972 +       help
48973 +         This is the Linux Xen port.
48974 +
48975 +if XEN
48976 +config XEN_INTERFACE_VERSION
48977 +       hex
48978 +       default 0x00030203
48979 +
48980 +menu "XEN"
48981 +
48982 +config XEN_PRIVILEGED_GUEST
48983 +       bool "Privileged Guest (domain 0)"
48984 +       depends on XEN
48985 +       default n
48986 +       help
48987 +         Support for privileged operation (domain 0)
48988 +
48989 +config XEN_UNPRIVILEGED_GUEST
48990 +       bool
48991 +       default !XEN_PRIVILEGED_GUEST
48992 +
48993 +config XEN_PRIVCMD
48994 +       bool
48995 +       depends on PROC_FS
48996 +       default y
48997 +
48998 +config XEN_XENBUS_DEV
48999 +       bool
49000 +       depends on PROC_FS
49001 +       default y
49002 +
49003 +config XEN_BACKEND
49004 +        tristate "Backend driver support"
49005 +        default y
49006 +        help
49007 +          Support for backend device drivers that provide I/O services
49008 +          to other virtual machines.
49009 +
49010 +config XEN_BLKDEV_BACKEND
49011 +       tristate "Block-device backend driver"
49012 +        depends on XEN_BACKEND
49013 +       default y
49014 +       help
49015 +         The block-device backend driver allows the kernel to export its
49016 +         block devices to other guests via a high-performance shared-memory
49017 +         interface.
49018 +
49019 +config XEN_BLKDEV_TAP
49020 +       tristate "Block-device tap backend driver"
49021 +       depends on XEN_BACKEND
49022 +       default XEN_PRIVILEGED_GUEST
49023 +       help
49024 +         The block tap driver is an alternative to the block back driver 
49025 +          and allows VM block requests to be redirected to userspace through
49026 +          a device interface.  The tap allows user-space development of 
49027 +          high-performance block backends, where disk images may be implemented
49028 +          as files, in memory, or on other hosts across the network.  This 
49029 +         driver can safely coexist with the existing blockback driver.
49030 +
49031 +config XEN_NETDEV_BACKEND
49032 +       tristate "Network-device backend driver"
49033 +        depends on XEN_BACKEND && NET
49034 +       default y
49035 +       help
49036 +         The network-device backend driver allows the kernel to export its
49037 +         network devices to other guests via a high-performance shared-memory
49038 +         interface.
49039 +
49040 +config XEN_NETDEV_PIPELINED_TRANSMITTER
49041 +       bool "Pipelined transmitter (DANGEROUS)"
49042 +       depends on XEN_NETDEV_BACKEND
49043 +       default n
49044 +       help
49045 +         If the net backend is a dumb domain, such as a transparent Ethernet
49046 +         bridge with no local IP interface, it is safe to say Y here to get
49047 +         slightly lower network overhead.
49048 +         If the backend has a local IP interface; or may be doing smart things
49049 +         like reassembling packets to perform firewall filtering; or if you
49050 +         are unsure; or if you experience network hangs when this option is
49051 +         enabled; then you must say N here.
49052 +
49053 +config XEN_NETDEV_LOOPBACK
49054 +       tristate "Network-device loopback driver"
49055 +       depends on XEN_NETDEV_BACKEND
49056 +       default y
49057 +       help
49058 +         A two-interface loopback device to emulate a local netfront-netback
49059 +         connection.
49060 +
49061 +config XEN_PCIDEV_BACKEND
49062 +       tristate "PCI-device backend driver"
49063 +       depends on PCI && XEN_BACKEND
49064 +       default XEN_PRIVILEGED_GUEST
49065 +       help
49066 +         The PCI device backend driver allows the kernel to export arbitrary
49067 +         PCI devices to other guests. If you select this to be a module, you
49068 +         will need to make sure no other driver has bound to the device(s)
49069 +         you want to make visible to other guests.
49070 +
49071 +choice
49072 +       prompt "PCI Backend Mode"
49073 +       depends on XEN_PCIDEV_BACKEND
49074 +       default XEN_PCIDEV_BACKEND_VPCI
49075 +
49076 +config XEN_PCIDEV_BACKEND_VPCI
49077 +       bool "Virtual PCI"
49078 +       ---help---
49079 +         This PCI Backend hides the true PCI topology and makes the frontend
49080 +         think there is a single PCI bus with only the exported devices on it.
49081 +         For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
49082 +         second device at 02:1a.1 will be re-assigned to 00:01.1.
49083 +
49084 +config XEN_PCIDEV_BACKEND_PASS
49085 +       bool "Passthrough"
49086 +       ---help---
49087 +         This PCI Backend provides a real view of the PCI topology to the
49088 +         frontend (for example, a device at 06:01.b will still appear at
49089 +         06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
49090 +         PCI devices to its driver domains. This may be required for drivers
49091 +         which depend on finding their hardware in certain bus/slot
49092 +         locations.
49093 +
49094 +config XEN_PCIDEV_BACKEND_SLOT
49095 +       bool "Slot"
49096 +       ---help---
49097 +         This PCI Backend hides the true PCI topology and makes the frontend
49098 +         think there is a single PCI bus with only the exported devices on it.
49099 +         Contrary to the virtual PCI backend, a function becomes a new slot.
49100 +         For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
49101 +         second device at 02:1a.1 will be re-assigned to 00:01.0.
49102 +
49103 +endchoice
49104 +
49105 +config XEN_PCIDEV_BE_DEBUG
49106 +       bool "PCI Backend Debugging"
49107 +       depends on XEN_PCIDEV_BACKEND
49108 +       default n
49109 +
49110 +config XEN_TPMDEV_BACKEND
49111 +       tristate "TPM-device backend driver"
49112 +        depends on XEN_BACKEND
49113 +       default n
49114 +       help
49115 +         The TPM-device backend driver services TPM requests from other guests.
49116 +
49117 +config XEN_BLKDEV_FRONTEND
49118 +       tristate "Block-device frontend driver"
49119 +       depends on XEN
49120 +       default y
49121 +       help
49122 +         The block-device frontend driver allows the kernel to access block
49123 +         devices exported by another guest OS. Unless you are building a
49124 +         dedicated device-driver domain or the master control domain
49125 +         (domain 0), you almost certainly want to say Y here.
49126 +
49127 +config XEN_NETDEV_FRONTEND
49128 +       tristate "Network-device frontend driver"
49129 +       depends on XEN && NET
49130 +       default y
49131 +       help
49132 +         The network-device frontend driver allows the kernel to access
49133 +         network interfaces provided by another guest OS. Unless you are building a
49134 +         dedicated device-driver domain or the master control domain
49135 +         (domain 0), you almost certainly want to say Y here.
49136 +
49137 +config XEN_FRAMEBUFFER
49138 +       tristate "Framebuffer-device frontend driver"
49139 +       depends on XEN && FB
49140 +       select FB_CFB_FILLRECT
49141 +       select FB_CFB_COPYAREA
49142 +       select FB_CFB_IMAGEBLIT
49143 +       default y
49144 +       help
49145 +         The framebuffer-device frontend driver allows the kernel to create a
49146 +         virtual framebuffer.  This framebuffer can be viewed in another
49147 +         domain.  Unless this domain has access to a real video card, you
49148 +         probably want to say Y here.
49149 +
49150 +config XEN_KEYBOARD
49151 +       tristate "Keyboard-device frontend driver"
49152 +       depends on XEN && XEN_FRAMEBUFFER && INPUT
49153 +       default y
49154 +       help
49155 +         The keyboard-device frontend driver allows the kernel to create a
49156 +         virtual keyboard.  This keyboard can then be driven by another
49157 +         domain.  If you've said Y to CONFIG_XEN_FRAMEBUFFER, you probably
49158 +         want to say Y here.
49159 +
49160 +config XEN_SCRUB_PAGES
49161 +       bool "Scrub memory before freeing it to Xen"
49162 +       default y
49163 +       help
49164 +         Erase memory contents before freeing it back to Xen's global
49165 +         pool. This ensures that any secrets contained within that
49166 +         memory (e.g., private keys) cannot be found by other guests that
49167 +         may be running on the machine. Most people will want to say Y here.
49168 +         If security is not a concern then you may increase performance by
49169 +         saying N.
49170 +
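A minimal sketch of what the scrub boils down to (illustrative only; the actual helper is the scrub_pages() routine used by the balloon driver below): every page is wiped before its frame is handed back to the hypervisor.

	/* Hypothetical sketch: zero 'count' pages starting at 'v' so that no
	 * guest which later receives these frames can read their old contents. */
	static inline void scrub_pages_sketch(void *v, unsigned int count)
	{
		while (count--) {
			clear_page(v);			/* erase former contents */
			v = (char *)v + PAGE_SIZE;	/* advance to the next page */
		}
	}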
49171 +config XEN_DISABLE_SERIAL
49172 +       bool "Disable serial port drivers"
49173 +       default y
49174 +       help
49175 +         Disable serial port drivers, allowing the Xen console driver
49176 +         to provide a serial console at ttyS0.
49177 +
49178 +config XEN_SYSFS
49179 +       tristate "Export Xen attributes in sysfs"
49180 +       depends on SYSFS
49181 +       default y
49182 +       help
49183 +         Xen hypervisor attributes will show up under /sys/hypervisor/.
49184 +
49185 +choice
49186 +       prompt "Xen version compatibility"
49187 +       default XEN_COMPAT_030002_AND_LATER
49188 +
49189 +       config XEN_COMPAT_030002_AND_LATER
49190 +               bool "3.0.2 and later"
49191 +
49192 +       config XEN_COMPAT_LATEST_ONLY
49193 +               bool "no compatibility code"
49194 +
49195 +endchoice
49196 +
49197 +config XEN_COMPAT_030002
49198 +       bool
49199 +       default XEN_COMPAT_030002_AND_LATER
49200 +
49201 +endmenu
49202 +
49203 +config HAVE_ARCH_ALLOC_SKB
49204 +       bool
49205 +       default y
49206 +
49207 +config HAVE_ARCH_DEV_ALLOC_SKB
49208 +       bool
49209 +       default y
49210 +
49211 +config HAVE_IRQ_IGNORE_UNHANDLED
49212 +       bool
49213 +       default y
49214 +
49215 +config NO_IDLE_HZ
49216 +       bool
49217 +       default y
49218 +
49219 +config XEN_UTIL
49220 +       bool
49221 +       default y
49222 +
49223 +config XEN_BALLOON
49224 +       bool
49225 +       default y
49226 +
49227 +config XEN_DEVMEM
49228 +       bool
49229 +       default y
49230 +
49231 +config XEN_SKBUFF
49232 +       bool
49233 +       default y
49234 +       depends on NET
49235 +
49236 +config XEN_REBOOT
49237 +       bool
49238 +       default y
49239 +
49240 +config XEN_SMPBOOT
49241 +       bool
49242 +       default y
49243 +       depends on SMP
49244 +
49245 +endif
49246 diff -ruNp linux-2.6.19/drivers/xen/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/Makefile
49247 --- linux-2.6.19/drivers/xen/Makefile   1970-01-01 00:00:00.000000000 +0000
49248 +++ linux-2.6.19-xen-3.0.4/drivers/xen/Makefile 2007-02-02 19:10:45.000000000 +0000
49249 @@ -0,0 +1,19 @@
49250 +obj-y  += core/
49251 +obj-y  += console/
49252 +obj-y  += evtchn/
49253 +obj-y  += privcmd/
49254 +obj-y  += xenbus/
49255 +
49256 +obj-$(CONFIG_XEN_UTIL)                 += util.o
49257 +obj-$(CONFIG_XEN_BALLOON)              += balloon/
49258 +obj-$(CONFIG_XEN_DEVMEM)               += char/
49259 +obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
49260 +obj-$(CONFIG_XEN_BLKDEV_TAP)           += blktap/
49261 +obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
49262 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmback/
49263 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
49264 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      += netfront/
49265 +obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += pciback/
49266 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront/
49267 +obj-$(CONFIG_XEN_FRAMEBUFFER)          += fbfront/
49268 +obj-$(CONFIG_XEN_KEYBOARD)             += fbfront/
49269 diff -ruNp linux-2.6.19/drivers/xen/balloon/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/balloon/Makefile
49270 --- linux-2.6.19/drivers/xen/balloon/Makefile   1970-01-01 00:00:00.000000000 +0000
49271 +++ linux-2.6.19-xen-3.0.4/drivers/xen/balloon/Makefile 2007-02-02 19:10:45.000000000 +0000
49272 @@ -0,0 +1,2 @@
49273 +
49274 +obj-y := balloon.o sysfs.o
49275 diff -ruNp linux-2.6.19/drivers/xen/balloon/balloon.c linux-2.6.19-xen-3.0.4/drivers/xen/balloon/balloon.c
49276 --- linux-2.6.19/drivers/xen/balloon/balloon.c  1970-01-01 00:00:00.000000000 +0000
49277 +++ linux-2.6.19-xen-3.0.4/drivers/xen/balloon/balloon.c        2007-02-02 19:10:45.000000000 +0000
49278 @@ -0,0 +1,624 @@
49279 +/******************************************************************************
49280 + * balloon.c
49281 + *
49282 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
49283 + *
49284 + * Copyright (c) 2003, B Dragovic
49285 + * Copyright (c) 2003-2004, M Williamson, K Fraser
49286 + * Copyright (c) 2005 Dan M. Smith, IBM Corporation
49287 + * 
49288 + * This program is free software; you can redistribute it and/or
49289 + * modify it under the terms of the GNU General Public License version 2
49290 + * as published by the Free Software Foundation; or, when distributed
49291 + * separately from the Linux kernel or incorporated into other
49292 + * software packages, subject to the following license:
49293 + * 
49294 + * Permission is hereby granted, free of charge, to any person obtaining a copy
49295 + * of this source file (the "Software"), to deal in the Software without
49296 + * restriction, including without limitation the rights to use, copy, modify,
49297 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
49298 + * and to permit persons to whom the Software is furnished to do so, subject to
49299 + * the following conditions:
49300 + * 
49301 + * The above copyright notice and this permission notice shall be included in
49302 + * all copies or substantial portions of the Software.
49303 + * 
49304 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49305 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49306 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49307 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49308 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
49309 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
49310 + * IN THE SOFTWARE.
49311 + */
49312 +
49313 +#include <linux/kernel.h>
49314 +#include <linux/module.h>
49315 +#include <linux/sched.h>
49316 +#include <linux/errno.h>
49317 +#include <linux/mm.h>
49318 +#include <linux/mman.h>
49319 +#include <linux/smp_lock.h>
49320 +#include <linux/pagemap.h>
49321 +#include <linux/bootmem.h>
49322 +#include <linux/highmem.h>
49323 +#include <linux/vmalloc.h>
49324 +#include <xen/xen_proc.h>
49325 +#include <asm/hypervisor.h>
49326 +#include <xen/balloon.h>
49327 +#include <xen/interface/memory.h>
49328 +#include <asm/pgalloc.h>
49329 +#include <asm/pgtable.h>
49330 +#include <asm/uaccess.h>
49331 +#include <asm/tlb.h>
49332 +#include <linux/list.h>
49333 +#include <xen/xenbus.h>
49334 +#include "common.h"
49335 +
49336 +#ifdef CONFIG_PROC_FS
49337 +static struct proc_dir_entry *balloon_pde;
49338 +#endif
49339 +
49340 +static DECLARE_MUTEX(balloon_mutex);
49341 +
49342 +/*
49343 + * Protects atomic reservation decrease/increase against concurrent increases.
49344 + * Also protects non-atomic updates of current_pages and driver_pages, and
49345 + * balloon lists.
49346 + */
49347 +DEFINE_SPINLOCK(balloon_lock);
49348 +
49349 +struct balloon_stats balloon_stats;
49350 +
49351 +/* We increase/decrease in batches which fit in a page */
49352 +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
49353 +
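An illustrative sizing note for the batch array above, assuming 4 KiB pages:

	/* PAGE_SIZE / sizeof(unsigned long) frame numbers fit in one page:
	 *   32-bit x86: 4096 / 4 = 1024 frames per hypercall batch
	 *   x86-64:     4096 / 8 =  512 frames per hypercall batch
	 * so each increase/decrease pass moves at most that many pages. */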
49354 +/* VM /proc information for memory */
49355 +extern unsigned long totalram_pages;
49356 +
49357 +/* List of ballooned pages, threaded through the mem_map array. */
49358 +static LIST_HEAD(ballooned_pages);
49359 +
49360 +/* Main work function, always executed in process context. */
49361 +static void balloon_process(void *unused);
49362 +static DECLARE_WORK(balloon_worker, balloon_process, NULL);
49363 +static struct timer_list balloon_timer;
49364 +
49365 +/* When ballooning out (allocating memory to return to Xen) we don't really 
49366 +   want the kernel to try too hard since that can trigger the oom killer. */
49367 +#define GFP_BALLOON \
49368 +       (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
49369 +
49370 +#define PAGE_TO_LIST(p) (&(p)->lru)
49371 +#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
49372 +#define UNLIST_PAGE(p)                         \
49373 +       do {                                    \
49374 +               list_del(PAGE_TO_LIST(p));      \
49375 +               PAGE_TO_LIST(p)->next = NULL;   \
49376 +               PAGE_TO_LIST(p)->prev = NULL;   \
49377 +       } while(0)
49378 +
49379 +#define IPRINTK(fmt, args...) \
49380 +       printk(KERN_INFO "xen_mem: " fmt, ##args)
49381 +#define WPRINTK(fmt, args...) \
49382 +       printk(KERN_WARNING "xen_mem: " fmt, ##args)
49383 +
49384 +/* balloon_append: add the given page to the balloon. */
49385 +static void balloon_append(struct page *page)
49386 +{
49387 +       /* Lowmem is re-populated first, so highmem pages go at list tail. */
49388 +       if (PageHighMem(page)) {
49389 +               list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
49390 +               bs.balloon_high++;
49391 +       } else {
49392 +               list_add(PAGE_TO_LIST(page), &ballooned_pages);
49393 +               bs.balloon_low++;
49394 +       }
49395 +}
49396 +
49397 +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
49398 +static struct page *balloon_retrieve(void)
49399 +{
49400 +       struct page *page;
49401 +
49402 +       if (list_empty(&ballooned_pages))
49403 +               return NULL;
49404 +
49405 +       page = LIST_TO_PAGE(ballooned_pages.next);
49406 +       UNLIST_PAGE(page);
49407 +
49408 +       if (PageHighMem(page))
49409 +               bs.balloon_high--;
49410 +       else
49411 +               bs.balloon_low--;
49412 +
49413 +       return page;
49414 +}
49415 +
49416 +static struct page *balloon_first_page(void)
49417 +{
49418 +       if (list_empty(&ballooned_pages))
49419 +               return NULL;
49420 +       return LIST_TO_PAGE(ballooned_pages.next);
49421 +}
49422 +
49423 +static struct page *balloon_next_page(struct page *page)
49424 +{
49425 +       struct list_head *next = PAGE_TO_LIST(page)->next;
49426 +       if (next == &ballooned_pages)
49427 +               return NULL;
49428 +       return LIST_TO_PAGE(next);
49429 +}
49430 +
49431 +static void balloon_alarm(unsigned long unused)
49432 +{
49433 +       schedule_work(&balloon_worker);
49434 +}
49435 +
49436 +static unsigned long current_target(void)
49437 +{
49438 +       unsigned long target = min(bs.target_pages, bs.hard_limit);
49439 +       if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
49440 +               target = bs.current_pages + bs.balloon_low + bs.balloon_high;
49441 +       return target;
49442 +}
49443 +
49444 +static int increase_reservation(unsigned long nr_pages)
49445 +{
49446 +       unsigned long  pfn, i, flags;
49447 +       struct page   *page;
49448 +       long           rc;
49449 +       struct xen_memory_reservation reservation = {
49450 +               .address_bits = 0,
49451 +               .extent_order = 0,
49452 +               .domid        = DOMID_SELF
49453 +       };
49454 +
49455 +       if (nr_pages > ARRAY_SIZE(frame_list))
49456 +               nr_pages = ARRAY_SIZE(frame_list);
49457 +
49458 +       balloon_lock(flags);
49459 +
49460 +       page = balloon_first_page();
49461 +       for (i = 0; i < nr_pages; i++) {
49462 +               BUG_ON(page == NULL);
49463 +               frame_list[i] = page_to_pfn(page);
49464 +               page = balloon_next_page(page);
49465 +       }
49466 +
49467 +       set_xen_guest_handle(reservation.extent_start, frame_list);
49468 +       reservation.nr_extents   = nr_pages;
49469 +       rc = HYPERVISOR_memory_op(
49470 +               XENMEM_populate_physmap, &reservation);
49471 +       if (rc < nr_pages) {
49472 +               if (rc > 0) {
49473 +                       int ret;
49474 +
49475 +                       /* We hit the Xen hard limit: reprobe. */
49476 +                       reservation.nr_extents = rc;
49477 +                       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
49478 +                                       &reservation);
49479 +                       BUG_ON(ret != rc);
49480 +               }
49481 +               if (rc >= 0)
49482 +                       bs.hard_limit = (bs.current_pages + rc -
49483 +                                        bs.driver_pages);
49484 +               goto out;
49485 +       }
49486 +
49487 +       for (i = 0; i < nr_pages; i++) {
49488 +               page = balloon_retrieve();
49489 +               BUG_ON(page == NULL);
49490 +
49491 +               pfn = page_to_pfn(page);
49492 +               BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
49493 +                      phys_to_machine_mapping_valid(pfn));
49494 +
49495 +               set_phys_to_machine(pfn, frame_list[i]);
49496 +
49497 +               /* Link back into the page tables if not highmem. */
49498 +               if (pfn < max_low_pfn) {
49499 +                       int ret;
49500 +                       ret = HYPERVISOR_update_va_mapping(
49501 +                               (unsigned long)__va(pfn << PAGE_SHIFT),
49502 +                               pfn_pte_ma(frame_list[i], PAGE_KERNEL),
49503 +                               0);
49504 +                       BUG_ON(ret);
49505 +               }
49506 +
49507 +               /* Relinquish the page back to the allocator. */
49508 +               ClearPageReserved(page);
49509 +               init_page_count(page);
49510 +               __free_page(page);
49511 +       }
49512 +
49513 +       bs.current_pages += nr_pages;
49514 +       totalram_pages = bs.current_pages;
49515 +
49516 + out:
49517 +       balloon_unlock(flags);
49518 +
49519 +       return 0;
49520 +}
49521 +
49522 +static int decrease_reservation(unsigned long nr_pages)
49523 +{
49524 +       unsigned long  pfn, i, flags;
49525 +       struct page   *page;
49526 +       void          *v;
49527 +       int            need_sleep = 0;
49528 +       int ret;
49529 +       struct xen_memory_reservation reservation = {
49530 +               .address_bits = 0,
49531 +               .extent_order = 0,
49532 +               .domid        = DOMID_SELF
49533 +       };
49534 +
49535 +       if (nr_pages > ARRAY_SIZE(frame_list))
49536 +               nr_pages = ARRAY_SIZE(frame_list);
49537 +
49538 +       for (i = 0; i < nr_pages; i++) {
49539 +               if ((page = alloc_page(GFP_BALLOON)) == NULL) {
49540 +                       nr_pages = i;
49541 +                       need_sleep = 1;
49542 +                       break;
49543 +               }
49544 +
49545 +               pfn = page_to_pfn(page);
49546 +               frame_list[i] = pfn_to_mfn(pfn);
49547 +
49548 +               if (!PageHighMem(page)) {
49549 +                       v = phys_to_virt(pfn << PAGE_SHIFT);
49550 +                       scrub_pages(v, 1);
49551 +                       ret = HYPERVISOR_update_va_mapping(
49552 +                               (unsigned long)v, __pte_ma(0), 0);
49553 +                       BUG_ON(ret);
49554 +               }
49555 +#ifdef CONFIG_XEN_SCRUB_PAGES
49556 +               else {
49557 +                       v = kmap(page);
49558 +                       scrub_pages(v, 1);
49559 +                       kunmap(page);
49560 +               }
49561 +#endif
49562 +       }
49563 +
49564 +       /* Ensure that ballooned highmem pages don't have kmaps. */
49565 +       kmap_flush_unused();
49566 +       flush_tlb_all();
49567 +
49568 +       balloon_lock(flags);
49569 +
49570 +       /* No more mappings: invalidate P2M and add to balloon. */
49571 +       for (i = 0; i < nr_pages; i++) {
49572 +               pfn = mfn_to_pfn(frame_list[i]);
49573 +               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
49574 +               balloon_append(pfn_to_page(pfn));
49575 +       }
49576 +
49577 +       set_xen_guest_handle(reservation.extent_start, frame_list);
49578 +       reservation.nr_extents   = nr_pages;
49579 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
49580 +       BUG_ON(ret != nr_pages);
49581 +
49582 +       bs.current_pages -= nr_pages;
49583 +       totalram_pages = bs.current_pages;
49584 +
49585 +       balloon_unlock(flags);
49586 +
49587 +       return need_sleep;
49588 +}
49589 +
49590 +/*
49591 + * We avoid multiple worker processes conflicting via the balloon mutex.
49592 + * We may of course race updates of the target counts (which are protected
49593 + * by the balloon lock), or with changes to the Xen hard limit, but we will
49594 + * recover from these in time.
49595 + */
49596 +static void balloon_process(void *unused)
49597 +{
49598 +       int need_sleep = 0;
49599 +       long credit;
49600 +
49601 +       down(&balloon_mutex);
49602 +
49603 +       do {
49604 +               credit = current_target() - bs.current_pages;
49605 +               if (credit > 0)
49606 +                       need_sleep = (increase_reservation(credit) != 0);
49607 +               if (credit < 0)
49608 +                       need_sleep = (decrease_reservation(-credit) != 0);
49609 +
49610 +#ifndef CONFIG_PREEMPT
49611 +               if (need_resched())
49612 +                       schedule();
49613 +#endif
49614 +       } while ((credit != 0) && !need_sleep);
49615 +
49616 +       /* Schedule more work if there is some still to be done. */
49617 +       if (current_target() != bs.current_pages)
49618 +               mod_timer(&balloon_timer, jiffies + HZ);
49619 +
49620 +       up(&balloon_mutex);
49621 +}
49622 +
49623 +/* Resets the Xen limit, sets new target, and kicks off processing. */
49624 +void balloon_set_new_target(unsigned long target)
49625 +{
49626 +       /* No need for lock. Not read-modify-write updates. */
49627 +       bs.hard_limit   = ~0UL;
49628 +       bs.target_pages = target;
49629 +       schedule_work(&balloon_worker);
49630 +}
49631 +
49632 +static struct xenbus_watch target_watch =
49633 +{
49634 +       .node = "memory/target"
49635 +};
49636 +
49637 +/* React to a change in the target key */
49638 +static void watch_target(struct xenbus_watch *watch,
49639 +                        const char **vec, unsigned int len)
49640 +{
49641 +       unsigned long long new_target;
49642 +       int err;
49643 +
49644 +       err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
49645 +       if (err != 1) {
49646 +               /* This is ok (for domain0 at least) - so just return */
49647 +               return;
49648 +       }
49649 +
49650 +       /* The given memory/target value is in KiB, so it needs converting to
49651 +        * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
49652 +        */
49653 +       balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
49654 +}
49655 +
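A worked check of the shift above (illustrative, assuming the usual PAGE_SHIFT of 12 for 4 KiB pages):

	/* memory/target is in KiB, so pages = KiB >> (PAGE_SHIFT - 10).
	 * Example: a target of 262144 KiB (256 MiB) gives
	 *   262144 >> (12 - 10) = 262144 >> 2 = 65536 pages of 4 KiB. */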
49656 +static int balloon_init_watcher(struct notifier_block *notifier,
49657 +                               unsigned long event,
49658 +                               void *data)
49659 +{
49660 +       int err;
49661 +
49662 +       err = register_xenbus_watch(&target_watch);
49663 +       if (err)
49664 +               printk(KERN_ERR "Failed to set balloon watcher\n");
49665 +
49666 +       return NOTIFY_DONE;
49667 +}
49668 +
49669 +#ifdef CONFIG_PROC_FS
49670 +static int balloon_write(struct file *file, const char __user *buffer,
49671 +                        unsigned long count, void *data)
49672 +{
49673 +       char memstring[64], *endchar;
49674 +       unsigned long long target_bytes;
49675 +
49676 +       if (!capable(CAP_SYS_ADMIN))
49677 +               return -EPERM;
49678 +
49679 +       if (count <= 1)
49680 +               return -EBADMSG; /* runt */
49681 +       if (count > sizeof(memstring))
49682 +               return -EFBIG;   /* too long */
49683 +
49684 +       if (copy_from_user(memstring, buffer, count))
49685 +               return -EFAULT;
49686 +       memstring[sizeof(memstring)-1] = '\0';
49687 +
49688 +       target_bytes = memparse(memstring, &endchar);
49689 +       balloon_set_new_target(target_bytes >> PAGE_SHIFT);
49690 +
49691 +       return count;
49692 +}
49693 +
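A short illustration of how a write to /proc/xen/balloon is interpreted (a sketch, assuming 4 KiB pages; memparse() accepts the usual K/M/G suffixes):

	/* e.g. `echo 512M > /proc/xen/balloon` reaches balloon_write() as the
	 * string "512M", so target_bytes = memparse("512M", &endchar) = 536870912
	 * and the new target is 536870912 >> 12 = 131072 pages. */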
49694 +static int balloon_read(char *page, char **start, off_t off,
49695 +                       int count, int *eof, void *data)
49696 +{
49697 +       int len;
49698 +
49699 +       len = sprintf(
49700 +               page,
49701 +               "Current allocation: %8lu kB\n"
49702 +               "Requested target:   %8lu kB\n"
49703 +               "Low-mem balloon:    %8lu kB\n"
49704 +               "High-mem balloon:   %8lu kB\n"
49705 +               "Driver pages:       %8lu kB\n"
49706 +               "Xen hard limit:     ",
49707 +               PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), 
49708 +               PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
49709 +               PAGES2KB(bs.driver_pages));
49710 +
49711 +       if (bs.hard_limit != ~0UL)
49712 +               len += sprintf(page + len, "%8lu kB\n",
49713 +                              PAGES2KB(bs.hard_limit));
49714 +       else
49715 +               len += sprintf(page + len, "     ??? kB\n");
49716 +
49717 +       *eof = 1;
49718 +       return len;
49719 +}
49720 +#endif
49721 +
49722 +static struct notifier_block xenstore_notifier;
49723 +
49724 +static int __init balloon_init(void)
49725 +{
49726 +       unsigned long pfn;
49727 +       struct page *page;
49728 +
49729 +       if (!is_running_on_xen())
49730 +               return -ENODEV;
49731 +
49732 +       IPRINTK("Initialising balloon driver.\n");
49733 +
49734 +       bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
49735 +       totalram_pages   = bs.current_pages;
49736 +       bs.target_pages  = bs.current_pages;
49737 +       bs.balloon_low   = 0;
49738 +       bs.balloon_high  = 0;
49739 +       bs.driver_pages  = 0UL;
49740 +       bs.hard_limit    = ~0UL;
49741 +
49742 +       init_timer(&balloon_timer);
49743 +       balloon_timer.data = 0;
49744 +       balloon_timer.function = balloon_alarm;
49745 +    
49746 +#ifdef CONFIG_PROC_FS
49747 +       if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
49748 +               WPRINTK("Unable to create /proc/xen/balloon.\n");
49749 +               return -1;
49750 +       }
49751 +
49752 +       balloon_pde->read_proc  = balloon_read;
49753 +       balloon_pde->write_proc = balloon_write;
49754 +#endif
49755 +       balloon_sysfs_init();
49756 +    
49757 +       /* Initialise the balloon with excess memory space. */
49758 +       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
49759 +               page = pfn_to_page(pfn);
49760 +               if (!PageReserved(page))
49761 +                       balloon_append(page);
49762 +       }
49763 +
49764 +       target_watch.callback = watch_target;
49765 +       xenstore_notifier.notifier_call = balloon_init_watcher;
49766 +
49767 +       register_xenstore_notifier(&xenstore_notifier);
49768 +    
49769 +       return 0;
49770 +}
49771 +
49772 +subsys_initcall(balloon_init);
49773 +
49774 +void balloon_update_driver_allowance(long delta)
49775 +{
49776 +       unsigned long flags;
49777 +
49778 +       balloon_lock(flags);
49779 +       bs.driver_pages += delta;
49780 +       balloon_unlock(flags);
49781 +}
49782 +
49783 +static int dealloc_pte_fn(
49784 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
49785 +{
49786 +       unsigned long mfn = pte_mfn(*pte);
49787 +       int ret;
49788 +       struct xen_memory_reservation reservation = {
49789 +               .nr_extents   = 1,
49790 +               .extent_order = 0,
49791 +               .domid        = DOMID_SELF
49792 +       };
49793 +       set_xen_guest_handle(reservation.extent_start, &mfn);
49794 +       set_pte_at(&init_mm, addr, pte, __pte_ma(0));
49795 +       set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
49796 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
49797 +       BUG_ON(ret != 1);
49798 +       return 0;
49799 +}
49800 +
49801 +struct page **alloc_empty_pages_and_pagevec(int nr_pages)
49802 +{
49803 +       unsigned long vaddr, flags;
49804 +       struct page *page, **pagevec;
49805 +       int i, ret;
49806 +
49807 +       pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
49808 +       if (pagevec == NULL)
49809 +               return NULL;
49810 +
49811 +       for (i = 0; i < nr_pages; i++) {
49812 +               page = pagevec[i] = alloc_page(GFP_KERNEL);
49813 +               if (page == NULL)
49814 +                       goto err;
49815 +
49816 +               vaddr = (unsigned long)page_address(page);
49817 +
49818 +               scrub_pages(vaddr, 1);
49819 +
49820 +               balloon_lock(flags);
49821 +
49822 +               if (xen_feature(XENFEAT_auto_translated_physmap)) {
49823 +                       unsigned long gmfn = page_to_pfn(page);
49824 +                       struct xen_memory_reservation reservation = {
49825 +                               .nr_extents   = 1,
49826 +                               .extent_order = 0,
49827 +                               .domid        = DOMID_SELF
49828 +                       };
49829 +                       set_xen_guest_handle(reservation.extent_start, &gmfn);
49830 +                       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
49831 +                                                  &reservation);
49832 +                       if (ret == 1)
49833 +                               ret = 0; /* success */
49834 +               } else {
49835 +                       ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
49836 +                                                 dealloc_pte_fn, NULL);
49837 +               }
49838 +
49839 +               if (ret != 0) {
49840 +                       balloon_unlock(flags);
49841 +                       __free_page(page);
49842 +                       goto err;
49843 +               }
49844 +
49845 +               totalram_pages = --bs.current_pages;
49846 +
49847 +               balloon_unlock(flags);
49848 +       }
49849 +
49850 + out:
49851 +       schedule_work(&balloon_worker);
49852 +       flush_tlb_all();
49853 +       return pagevec;
49854 +
49855 + err:
49856 +       balloon_lock(flags);
49857 +       while (--i >= 0)
49858 +               balloon_append(pagevec[i]);
49859 +       balloon_unlock(flags);
49860 +       kfree(pagevec);
49861 +       pagevec = NULL;
49862 +       goto out;
49863 +}
49864 +
49865 +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
49866 +{
49867 +       unsigned long flags;
49868 +       int i;
49869 +
49870 +       if (pagevec == NULL)
49871 +               return;
49872 +
49873 +       balloon_lock(flags);
49874 +       for (i = 0; i < nr_pages; i++) {
49875 +               BUG_ON(page_count(pagevec[i]) != 1);
49876 +               balloon_append(pagevec[i]);
49877 +       }
49878 +       balloon_unlock(flags);
49879 +
49880 +       kfree(pagevec);
49881 +
49882 +       schedule_work(&balloon_worker);
49883 +}
49884 +
49885 +void balloon_release_driver_page(struct page *page)
49886 +{
49887 +       unsigned long flags;
49888 +
49889 +       balloon_lock(flags);
49890 +       balloon_append(page);
49891 +       bs.driver_pages--;
49892 +       balloon_unlock(flags);
49893 +
49894 +       schedule_work(&balloon_worker);
49895 +}
49896 +
49897 +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
49898 +EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
49899 +EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
49900 +EXPORT_SYMBOL_GPL(balloon_release_driver_page);
49901 +
49902 +MODULE_LICENSE("Dual BSD/GPL");
49903 diff -ruNp linux-2.6.19/drivers/xen/balloon/common.h linux-2.6.19-xen-3.0.4/drivers/xen/balloon/common.h
49904 --- linux-2.6.19/drivers/xen/balloon/common.h   1970-01-01 00:00:00.000000000 +0000
49905 +++ linux-2.6.19-xen-3.0.4/drivers/xen/balloon/common.h 2007-02-02 19:10:45.000000000 +0000
49906 @@ -0,0 +1,58 @@
49907 +/******************************************************************************
49908 + * balloon/common.h
49909 + *
49910 + * This program is free software; you can redistribute it and/or
49911 + * modify it under the terms of the GNU General Public License version 2
49912 + * as published by the Free Software Foundation; or, when distributed
49913 + * separately from the Linux kernel or incorporated into other
49914 + * software packages, subject to the following license:
49915 + * 
49916 + * Permission is hereby granted, free of charge, to any person obtaining a copy
49917 + * of this source file (the "Software"), to deal in the Software without
49918 + * restriction, including without limitation the rights to use, copy, modify,
49919 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
49920 + * and to permit persons to whom the Software is furnished to do so, subject to
49921 + * the following conditions:
49922 + * 
49923 + * The above copyright notice and this permission notice shall be included in
49924 + * all copies or substantial portions of the Software.
49925 + * 
49926 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49927 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49928 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49929 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49930 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
49931 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
49932 + * IN THE SOFTWARE.
49933 + */
49934 +
49935 +#ifndef __XEN_BALLOON_COMMON_H__
49936 +#define __XEN_BALLOON_COMMON_H__
49937 +
49938 +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
49939 +
49940 +struct balloon_stats {
49941 +       /* We aim for 'current allocation' == 'target allocation'. */
49942 +       unsigned long current_pages;
49943 +       unsigned long target_pages;
49944 +       /* We may hit the hard limit in Xen. If we do then we remember it. */
49945 +       unsigned long hard_limit;
49946 +       /*
49947 +        * Drivers may alter the memory reservation independently, but they
49948 +        * must inform the balloon driver so we avoid hitting the hard limit.
49949 +        */
49950 +       unsigned long driver_pages;
49951 +       /* Number of pages in high- and low-memory balloons. */
49952 +       unsigned long balloon_low;
49953 +       unsigned long balloon_high;
49954 +};
49955 +
49956 +extern struct balloon_stats balloon_stats;
49957 +#define bs balloon_stats
49958 +
49959 +int balloon_sysfs_init(void);
49960 +void balloon_sysfs_exit(void);
49961 +
49962 +void balloon_set_new_target(unsigned long target);
49963 +
49964 +#endif /* __XEN_BALLOON_COMMON_H__ */
49965 diff -ruNp linux-2.6.19/drivers/xen/balloon/sysfs.c linux-2.6.19-xen-3.0.4/drivers/xen/balloon/sysfs.c
49966 --- linux-2.6.19/drivers/xen/balloon/sysfs.c    1970-01-01 00:00:00.000000000 +0000
49967 +++ linux-2.6.19-xen-3.0.4/drivers/xen/balloon/sysfs.c  2007-02-02 19:10:45.000000000 +0000
49968 @@ -0,0 +1,164 @@
49969 +/******************************************************************************
49970 + * balloon/sysfs.c
49971 + *
49972 + * Xen balloon driver - sysfs interfaces.
49973 + * 
49974 + * This program is free software; you can redistribute it and/or
49975 + * modify it under the terms of the GNU General Public License version 2
49976 + * as published by the Free Software Foundation; or, when distributed
49977 + * separately from the Linux kernel or incorporated into other
49978 + * software packages, subject to the following license:
49979 + * 
49980 + * Permission is hereby granted, free of charge, to any person obtaining a copy
49981 + * of this source file (the "Software"), to deal in the Software without
49982 + * restriction, including without limitation the rights to use, copy, modify,
49983 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
49984 + * and to permit persons to whom the Software is furnished to do so, subject to
49985 + * the following conditions:
49986 + * 
49987 + * The above copyright notice and this permission notice shall be included in
49988 + * all copies or substantial portions of the Software.
49989 + * 
49990 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49991 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49992 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49993 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49994 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
49995 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
49996 + * IN THE SOFTWARE.
49997 + */
49998 +
49999 +#include <linux/capability.h>
50000 +#include <linux/stat.h>
50001 +#include <linux/sysdev.h>
50002 +#include "common.h"
50003 +
50004 +#define BALLOON_CLASS_NAME "memory"
50005 +
50006 +#define BALLOON_SHOW(name, format, args...)                    \
50007 +       static ssize_t show_##name(struct sys_device *dev,      \
50008 +                                  char *buf)                   \
50009 +       {                                                       \
50010 +               return sprintf(buf, format, ##args);            \
50011 +       }                                                       \
50012 +       static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
50013 +
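For reference, a sketch of what one instance of the macro above expands to (illustrative, under the sysdev API this file is written against):

	/* BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages)) becomes
	 * a read-only attribute roughly equivalent to: */
	static ssize_t show_current_kb(struct sys_device *dev, char *buf)
	{
		return sprintf(buf, "%lu\n", PAGES2KB(bs.current_pages));
	}
	static SYSDEV_ATTR(current_kb, S_IRUGO, show_current_kb, NULL);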
50014 +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
50015 +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
50016 +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
50017 +BALLOON_SHOW(hard_limit_kb,
50018 +            (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n",
50019 +            (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
50020 +BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
50021 +
50022 +static ssize_t show_target_kb(struct sys_device *dev, char *buf)
50023 +{
50024 +       return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
50025 +}
50026 +
50027 +static ssize_t store_target_kb(struct sys_device *dev,
50028 +                              const char *buf,
50029 +                              size_t count)
50030 +{
50031 +       char memstring[64], *endchar;
50032 +       unsigned long long target_bytes;
50033 +
50034 +       if (!capable(CAP_SYS_ADMIN))
50035 +               return -EPERM;
50036 +       
50037 +       if (count <= 1)
50038 +               return -EBADMSG; /* runt */
50039 +       if (count > sizeof(memstring))
50040 +               return -EFBIG;   /* too long */
50041 +       strcpy(memstring, buf);
50042 +       
50043 +       target_bytes = memparse(memstring, &endchar);
50044 +       balloon_set_new_target(target_bytes >> PAGE_SHIFT);
50045 +       
50046 +       return count;
50047 +}
50048 +
50049 +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
50050 +                  show_target_kb, store_target_kb);
50051 +
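A usage note for the writable attribute above (hedged; the exact sysfs path depends on how the sysdev class registered below is laid out, conventionally /sys/devices/system/memory/memory0/): the written value is parsed with memparse(), i.e. as bytes unless a K/M/G suffix is given, even though the file reads back in KiB.

	/* Illustrative only, path assumed:
	 *   echo 256M > /sys/devices/system/memory/memory0/target_kb
	 * requests a 268435456-byte target, i.e. 65536 pages of 4 KiB,
	 * exactly like writing "256M" to /proc/xen/balloon. */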
50052 +static struct sysdev_attribute *balloon_attrs[] = {
50053 +       &attr_target_kb,
50054 +};
50055 +
50056 +static struct attribute *balloon_info_attrs[] = {
50057 +       &attr_current_kb.attr,
50058 +       &attr_low_kb.attr,
50059 +       &attr_high_kb.attr,
50060 +       &attr_hard_limit_kb.attr,
50061 +       &attr_driver_kb.attr,
50062 +       NULL
50063 +};
50064 +
50065 +static struct attribute_group balloon_info_group = {
50066 +       .name = "info",
50067 +       .attrs = balloon_info_attrs,
50068 +};
50069 +
50070 +static struct sysdev_class balloon_sysdev_class = {
50071 +       set_kset_name(BALLOON_CLASS_NAME),
50072 +};
50073 +
50074 +static struct sys_device balloon_sysdev;
50075 +
50076 +static int register_balloon(struct sys_device *sysdev)
50077 +{
50078 +       int i, error;
50079 +
50080 +       error = sysdev_class_register(&balloon_sysdev_class);
50081 +       if (error)
50082 +               return error;
50083 +
50084 +       sysdev->id = 0;
50085 +       sysdev->cls = &balloon_sysdev_class;
50086 +
50087 +       error = sysdev_register(sysdev);
50088 +       if (error) {
50089 +               sysdev_class_unregister(&balloon_sysdev_class);
50090 +               return error;
50091 +       }
50092 +
50093 +       for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
50094 +               error = sysdev_create_file(sysdev, balloon_attrs[i]);
50095 +               if (error)
50096 +                       goto fail;
50097 +       }
50098 +
50099 +       error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
50100 +       if (error)
50101 +               goto fail;
50102 +       
50103 +       return 0;
50104 +
50105 + fail:
50106 +       while (--i >= 0)
50107 +               sysdev_remove_file(sysdev, balloon_attrs[i]);
50108 +       sysdev_unregister(sysdev);
50109 +       sysdev_class_unregister(&balloon_sysdev_class);
50110 +       return error;
50111 +}
50112 +
50113 +static void unregister_balloon(struct sys_device *sysdev)
50114 +{
50115 +       int i;
50116 +
50117 +       sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
50118 +       for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
50119 +               sysdev_remove_file(sysdev, balloon_attrs[i]);
50120 +       sysdev_unregister(sysdev);
50121 +       sysdev_class_unregister(&balloon_sysdev_class);
50122 +}
50123 +
50124 +int balloon_sysfs_init(void)
50125 +{
50126 +       return register_balloon(&balloon_sysdev);
50127 +}
50128 +
50129 +void balloon_sysfs_exit(void)
50130 +{
50131 +       unregister_balloon(&balloon_sysdev);
50132 +}
50133 diff -ruNp linux-2.6.19/drivers/xen/blkback/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/blkback/Makefile
50134 --- linux-2.6.19/drivers/xen/blkback/Makefile   1970-01-01 00:00:00.000000000 +0000
50135 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkback/Makefile 2007-02-02 19:10:45.000000000 +0000
50136 @@ -0,0 +1,3 @@
50137 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
50138 +
50139 +blkbk-y        := blkback.o xenbus.o interface.o vbd.o
50140 diff -ruNp linux-2.6.19/drivers/xen/blkback/blkback.c linux-2.6.19-xen-3.0.4/drivers/xen/blkback/blkback.c
50141 --- linux-2.6.19/drivers/xen/blkback/blkback.c  1970-01-01 00:00:00.000000000 +0000
50142 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkback/blkback.c        2007-02-02 19:10:45.000000000 +0000
50143 @@ -0,0 +1,581 @@
50144 +/******************************************************************************
50145 + * arch/xen/drivers/blkif/backend/main.c
50146 + * 
50147 + * Back-end of the driver for virtual block devices. This portion of the
50148 + * driver exports a 'unified' block-device interface that can be accessed
50149 + * by any operating system that implements a compatible front end. A 
50150 + * reference front-end implementation can be found in:
50151 + *  arch/xen/drivers/blkif/frontend
50152 + * 
50153 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
50154 + * Copyright (c) 2005, Christopher Clark
50155 + * 
50156 + * This program is free software; you can redistribute it and/or
50157 + * modify it under the terms of the GNU General Public License version 2
50158 + * as published by the Free Software Foundation; or, when distributed
50159 + * separately from the Linux kernel or incorporated into other
50160 + * software packages, subject to the following license:
50161 + * 
50162 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50163 + * of this source file (the "Software"), to deal in the Software without
50164 + * restriction, including without limitation the rights to use, copy, modify,
50165 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50166 + * and to permit persons to whom the Software is furnished to do so, subject to
50167 + * the following conditions:
50168 + * 
50169 + * The above copyright notice and this permission notice shall be included in
50170 + * all copies or substantial portions of the Software.
50171 + * 
50172 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50173 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50174 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50175 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50176 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50177 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50178 + * IN THE SOFTWARE.
50179 + */
50180 +
50181 +#include <linux/spinlock.h>
50182 +#include <linux/kthread.h>
50183 +#include <linux/list.h>
50184 +#include <xen/balloon.h>
50185 +#include <asm/hypervisor.h>
50186 +#include <asm/hypercall.h>
50187 +#include "common.h"
50188 +
50189 +/*
50190 + * These are rather arbitrary. They are fairly large because adjacent requests
50191 + * pulled from a communication ring are quite likely to end up being part of
50192 + * the same scatter/gather request at the disc.
50193 + * 
50194 + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
50195 + * 
50196 + * This will increase the chances of being able to write whole tracks.
50197 + * 64 should be enough to keep us competitive with Linux.
50198 + */
50199 +static int blkif_reqs = 64;
50200 +module_param_named(reqs, blkif_reqs, int, 0);
50201 +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
50202 +
50203 +/* Run-time switchable: /sys/module/blkback/parameters/ */
50204 +static unsigned int log_stats = 0;
50205 +static unsigned int debug_lvl = 0;
50206 +module_param(log_stats, int, 0644);
50207 +module_param(debug_lvl, int, 0644);
50208 +
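A brief usage sketch for the parameters above (hedged: the directory under /sys/module/ follows the name the module is actually built as, cf. the blkbk.o target in the blkback Makefile above):

	/* Illustrative only:
	 *   modprobe blkbk reqs=128                           -- more in-flight requests (load time)
	 *   echo 1 > /sys/module/blkbk/parameters/log_stats   -- periodic per-device stats
	 *   echo 1 > /sys/module/blkbk/parameters/debug_lvl   -- kthread start/exit messages
	 */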
50209 +/*
50210 + * Each outstanding request that we've passed to the lower device layers has a 
50211 + * 'pending_req' allocated to it. Each bio that completes decrements
50212 + * the pendcnt towards zero. When it hits zero, the specified domain has a 
50213 + * response queued for it, with the saved 'id' passed back.
50214 + */
50215 +typedef struct {
50216 +       blkif_t       *blkif;
50217 +       unsigned long  id;
50218 +       int            nr_pages;
50219 +       atomic_t       pendcnt;
50220 +       unsigned short operation;
50221 +       int            status;
50222 +       struct list_head free_list;
50223 +} pending_req_t;
50224 +
50225 +static pending_req_t *pending_reqs;
50226 +static struct list_head pending_free;
50227 +static DEFINE_SPINLOCK(pending_free_lock);
50228 +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
50229 +
50230 +#define BLKBACK_INVALID_HANDLE (~0)
50231 +
50232 +static struct page **pending_pages;
50233 +static grant_handle_t *pending_grant_handles;
50234 +
50235 +static inline int vaddr_pagenr(pending_req_t *req, int seg)
50236 +{
50237 +       return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
50238 +}
50239 +
50240 +static inline unsigned long vaddr(pending_req_t *req, int seg)
50241 +{
50242 +       unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
50243 +       return (unsigned long)pfn_to_kaddr(pfn);
50244 +}
50245 +
50246 +#define pending_handle(_req, _seg) \
50247 +       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
50248 +
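A worked example of the index arithmetic above (illustrative; it assumes the classic blkif limit BLKIF_MAX_SEGMENTS_PER_REQUEST of 11):

	/* Hypothetical numbers: for the pending request at index 3
	 * (req - pending_reqs == 3) and segment 5,
	 *   vaddr_pagenr(req, 5) = 3 * 11 + 5 = 38,
	 * so that request/segment pair always uses slot 38 of both
	 * pending_pages[] and pending_grant_handles[]. */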
50249 +
50250 +static int do_block_io_op(blkif_t *blkif);
50251 +static void dispatch_rw_block_io(blkif_t *blkif,
50252 +                                blkif_request_t *req,
50253 +                                pending_req_t *pending_req);
50254 +static void make_response(blkif_t *blkif, unsigned long id, 
50255 +                         unsigned short op, int st);
50256 +
50257 +/******************************************************************
50258 + * misc small helpers
50259 + */
50260 +static pending_req_t* alloc_req(void)
50261 +{
50262 +       pending_req_t *req = NULL;
50263 +       unsigned long flags;
50264 +
50265 +       spin_lock_irqsave(&pending_free_lock, flags);
50266 +       if (!list_empty(&pending_free)) {
50267 +               req = list_entry(pending_free.next, pending_req_t, free_list);
50268 +               list_del(&req->free_list);
50269 +       }
50270 +       spin_unlock_irqrestore(&pending_free_lock, flags);
50271 +       return req;
50272 +}
50273 +
50274 +static void free_req(pending_req_t *req)
50275 +{
50276 +       unsigned long flags;
50277 +       int was_empty;
50278 +
50279 +       spin_lock_irqsave(&pending_free_lock, flags);
50280 +       was_empty = list_empty(&pending_free);
50281 +       list_add(&req->free_list, &pending_free);
50282 +       spin_unlock_irqrestore(&pending_free_lock, flags);
50283 +       if (was_empty)
50284 +               wake_up(&pending_free_wq);
50285 +}
50286 +
50287 +static void unplug_queue(blkif_t *blkif)
50288 +{
50289 +       if (blkif->plug == NULL)
50290 +               return;
50291 +       if (blkif->plug->unplug_fn)
50292 +               blkif->plug->unplug_fn(blkif->plug);
50293 +       blk_put_queue(blkif->plug);
50294 +       blkif->plug = NULL;
50295 +}
50296 +
50297 +static void plug_queue(blkif_t *blkif, struct bio *bio)
50298 +{
50299 +       request_queue_t *q = bdev_get_queue(bio->bi_bdev);
50300 +
50301 +       if (q == blkif->plug)
50302 +               return;
50303 +       unplug_queue(blkif);
50304 +       blk_get_queue(q);
50305 +       blkif->plug = q;
50306 +}
50307 +
50308 +static void fast_flush_area(pending_req_t *req)
50309 +{
50310 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
50311 +       unsigned int i, invcount = 0;
50312 +       grant_handle_t handle;
50313 +       int ret;
50314 +
50315 +       for (i = 0; i < req->nr_pages; i++) {
50316 +               handle = pending_handle(req, i);
50317 +               if (handle == BLKBACK_INVALID_HANDLE)
50318 +                       continue;
50319 +               gnttab_set_unmap_op(&unmap[i], vaddr(req, i), GNTMAP_host_map,
50320 +                                   handle);
50321 +               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
50322 +               invcount++;
50323 +       }
50324 +
50325 +       ret = HYPERVISOR_grant_table_op(
50326 +               GNTTABOP_unmap_grant_ref, unmap, invcount);
50327 +       BUG_ON(ret);
50328 +}
50329 +
50330 +/******************************************************************
50331 + * SCHEDULER FUNCTIONS
50332 + */
50333 +
50334 +static void print_stats(blkif_t *blkif)
50335 +{
50336 +       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d\n",
50337 +              current->comm, blkif->st_oo_req,
50338 +              blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
50339 +       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
50340 +       blkif->st_rd_req = 0;
50341 +       blkif->st_wr_req = 0;
50342 +       blkif->st_oo_req = 0;
50343 +}
50344 +
50345 +int blkif_schedule(void *arg)
50346 +{
50347 +       blkif_t *blkif = arg;
50348 +
50349 +       blkif_get(blkif);
50350 +
50351 +       if (debug_lvl)
50352 +               printk(KERN_DEBUG "%s: started\n", current->comm);
50353 +
50354 +       while (!kthread_should_stop()) {
50355 +               wait_event_interruptible(
50356 +                       blkif->wq,
50357 +                       blkif->waiting_reqs || kthread_should_stop());
50358 +               wait_event_interruptible(
50359 +                       pending_free_wq,
50360 +                       !list_empty(&pending_free) || kthread_should_stop());
50361 +
50362 +               blkif->waiting_reqs = 0;
50363 +               smp_mb(); /* clear flag *before* checking for work */
50364 +
50365 +               if (do_block_io_op(blkif))
50366 +                       blkif->waiting_reqs = 1;
50367 +               unplug_queue(blkif);
50368 +
50369 +               if (log_stats && time_after(jiffies, blkif->st_print))
50370 +                       print_stats(blkif);
50371 +       }
50372 +
50373 +       if (log_stats)
50374 +               print_stats(blkif);
50375 +       if (debug_lvl)
50376 +               printk(KERN_DEBUG "%s: exiting\n", current->comm);
50377 +
50378 +       blkif->xenblkd = NULL;
50379 +       blkif_put(blkif);
50380 +
50381 +       return 0;
50382 +}
50383 +
50384 +/******************************************************************
50385 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
50386 + */
50387 +
50388 +static void __end_block_io_op(pending_req_t *pending_req, int error)
50389 +{
50390 +       /* An error fails the entire request. */
50391 +       if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
50392 +           (error == -EOPNOTSUPP)) {
50393 +               DPRINTK("blkback: write barrier op failed, not supported\n");
50394 +               blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
50395 +               pending_req->status = BLKIF_RSP_EOPNOTSUPP;
50396 +       } else if (error) {
50397 +               DPRINTK("Buffer not up-to-date at end of operation, "
50398 +                       "error=%d\n", error);
50399 +               pending_req->status = BLKIF_RSP_ERROR;
50400 +       }
50401 +
50402 +       if (atomic_dec_and_test(&pending_req->pendcnt)) {
50403 +               fast_flush_area(pending_req);
50404 +               make_response(pending_req->blkif, pending_req->id,
50405 +                             pending_req->operation, pending_req->status);
50406 +               blkif_put(pending_req->blkif);
50407 +               free_req(pending_req);
50408 +       }
50409 +}
50410 +
50411 +static int end_block_io_op(struct bio *bio, unsigned int done, int error)
50412 +{
50413 +       if (bio->bi_size != 0)
50414 +               return 1;
50415 +       __end_block_io_op(bio->bi_private, error);
50416 +       bio_put(bio);
50417 +       return error;
50418 +}
50419 +
50420 +
50421 +/******************************************************************************
50422 + * NOTIFICATION FROM GUEST OS.
50423 + */
50424 +
50425 +static void blkif_notify_work(blkif_t *blkif)
50426 +{
50427 +       blkif->waiting_reqs = 1;
50428 +       wake_up(&blkif->wq);
50429 +}
50430 +
50431 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
50432 +{
50433 +       blkif_notify_work(dev_id);
50434 +       return IRQ_HANDLED;
50435 +}
50436 +
50437 +
50438 +
50439 +/******************************************************************
50440 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
50441 + */
50442 +
50443 +static int do_block_io_op(blkif_t *blkif)
50444 +{
50445 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
50446 +       blkif_request_t req;
50447 +       pending_req_t *pending_req;
50448 +       RING_IDX rc, rp;
50449 +       int more_to_do = 0;
50450 +
50451 +       rc = blk_ring->req_cons;
50452 +       rp = blk_ring->sring->req_prod;
50453 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
50454 +
50455 +       while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
50456 +
50457 +               pending_req = alloc_req();
50458 +               if (NULL == pending_req) {
50459 +                       blkif->st_oo_req++;
50460 +                       more_to_do = 1;
50461 +                       break;
50462 +               }
50463 +
50464 +               memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
50465 +               blk_ring->req_cons = ++rc; /* before make_response() */
50466 +
50467 +               switch (req.operation) {
50468 +               case BLKIF_OP_READ:
50469 +                       blkif->st_rd_req++;
50470 +                       dispatch_rw_block_io(blkif, &req, pending_req);
50471 +                       break;
50472 +               case BLKIF_OP_WRITE_BARRIER:
50473 +                       blkif->st_br_req++;
50474 +                       /* fall through */
50475 +               case BLKIF_OP_WRITE:
50476 +                       blkif->st_wr_req++;
50477 +                       dispatch_rw_block_io(blkif, &req, pending_req);
50478 +                       break;
50479 +               default:
50480 +                       DPRINTK("error: unknown block io operation [%d]\n",
50481 +                               req.operation);
50482 +                       make_response(blkif, req.id, req.operation,
50483 +                                     BLKIF_RSP_ERROR);
50484 +                       free_req(pending_req);
50485 +                       break;
50486 +               }
50487 +       }
50488 +       return more_to_do;
50489 +}
50490 +
50491 +static void dispatch_rw_block_io(blkif_t *blkif,
50492 +                                blkif_request_t *req,
50493 +                                pending_req_t *pending_req)
50494 +{
50495 +       extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
50496 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
50497 +       struct phys_req preq;
50498 +       struct { 
50499 +               unsigned long buf; unsigned int nsec;
50500 +       } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
50501 +       unsigned int nseg;
50502 +       struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
50503 +       int ret, i, nbio = 0;
50504 +       int operation;
50505 +
50506 +       switch (req->operation) {
50507 +       case BLKIF_OP_READ:
50508 +               operation = READ;
50509 +               break;
50510 +       case BLKIF_OP_WRITE:
50511 +               operation = WRITE;
50512 +               break;
50513 +       case BLKIF_OP_WRITE_BARRIER:
50514 +               operation = WRITE_BARRIER;
50515 +               break;
50516 +       default:
50517 +               operation = 0; /* make gcc happy */
50518 +               BUG();
50519 +       }
50520 +
50521 +       /* Check that number of segments is sane. */
50522 +       nseg = req->nr_segments;
50523 +       if (unlikely(nseg == 0) || 
50524 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
50525 +               DPRINTK("Bad number of segments in request (%d)\n", nseg);
50526 +               goto fail_response;
50527 +       }
50528 +
50529 +       preq.dev           = req->handle;
50530 +       preq.sector_number = req->sector_number;
50531 +       preq.nr_sects      = 0;
50532 +
50533 +       pending_req->blkif     = blkif;
50534 +       pending_req->id        = req->id;
50535 +       pending_req->operation = req->operation;
50536 +       pending_req->status    = BLKIF_RSP_OKAY;
50537 +       pending_req->nr_pages  = nseg;
50538 +
50539 +       for (i = 0; i < nseg; i++) {
50540 +               uint32_t flags;
50541 +
50542 +               seg[i].nsec = req->seg[i].last_sect -
50543 +                       req->seg[i].first_sect + 1;
50544 +
50545 +               if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
50546 +                   (req->seg[i].last_sect < req->seg[i].first_sect))
50547 +                       goto fail_response;
50548 +               preq.nr_sects += seg[i].nsec;
50549 +
50550 +               flags = GNTMAP_host_map;
50551 +               if (operation != READ)
50552 +                       flags |= GNTMAP_readonly;
50553 +               gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
50554 +                                 req->seg[i].gref, blkif->domid);
50555 +       }
50556 +
50557 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
50558 +       BUG_ON(ret);
50559 +
50560 +       for (i = 0; i < nseg; i++) {
50561 +               if (unlikely(map[i].status != 0)) {
50562 +                       DPRINTK("invalid buffer -- could not remap it\n");
50563 +                       map[i].handle = BLKBACK_INVALID_HANDLE;
50564 +                       ret |= 1;
50565 +               }
50566 +
50567 +               pending_handle(pending_req, i) = map[i].handle;
50568 +
50569 +               if (ret)
50570 +                       continue;
50571 +
50572 +               set_phys_to_machine(__pa(vaddr(
50573 +                       pending_req, i)) >> PAGE_SHIFT,
50574 +                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
50575 +               seg[i].buf  = map[i].dev_bus_addr | 
50576 +                       (req->seg[i].first_sect << 9);
50577 +       }
50578 +
50579 +       if (ret)
50580 +               goto fail_flush;
50581 +
50582 +       if (vbd_translate(&preq, blkif, operation) != 0) {
50583 +               DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
50584 +                       operation == READ ? "read" : "write",
50585 +                       preq.sector_number,
50586 +                       preq.sector_number + preq.nr_sects, preq.dev);
50587 +               goto fail_flush;
50588 +       }
50589 +
50590 +       for (i = 0; i < nseg; i++) {
50591 +               if (((int)preq.sector_number|(int)seg[i].nsec) &
50592 +                   ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
50593 +                       DPRINTK("Misaligned I/O request from domain %d",
50594 +                               blkif->domid);
50595 +                       goto fail_put_bio;
50596 +               }
50597 +
50598 +               while ((bio == NULL) ||
50599 +                      (bio_add_page(bio,
50600 +                                    virt_to_page(vaddr(pending_req, i)),
50601 +                                    seg[i].nsec << 9,
50602 +                                    seg[i].buf & ~PAGE_MASK) == 0)) {
50603 +                       bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
50604 +                       if (unlikely(bio == NULL))
50605 +                               goto fail_put_bio;
50606 +
50607 +                       bio->bi_bdev    = preq.bdev;
50608 +                       bio->bi_private = pending_req;
50609 +                       bio->bi_end_io  = end_block_io_op;
50610 +                       bio->bi_sector  = preq.sector_number;
50611 +               }
50612 +
50613 +               preq.sector_number += seg[i].nsec;
50614 +       }
50615 +
50616 +       plug_queue(blkif, bio);
50617 +       atomic_set(&pending_req->pendcnt, nbio);
50618 +       blkif_get(blkif);
50619 +
50620 +       for (i = 0; i < nbio; i++)
50621 +               submit_bio(operation, biolist[i]);
50622 +
50623 +       return;
50624 +
50625 + fail_put_bio:
50626 +       for (i = 0; i < (nbio-1); i++)
50627 +               bio_put(biolist[i]);
50628 + fail_flush:
50629 +       fast_flush_area(pending_req);
50630 + fail_response:
50631 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
50632 +       free_req(pending_req);
50633 +} 
50634 +
50635 +
50636 +
50637 +/******************************************************************
50638 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
50639 + */
50640 +
50641 +
50642 +static void make_response(blkif_t *blkif, unsigned long id, 
50643 +                         unsigned short op, int st)
50644 +{
50645 +       blkif_response_t *resp;
50646 +       unsigned long     flags;
50647 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
50648 +       int more_to_do = 0;
50649 +       int notify;
50650 +
50651 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
50652 +
50653 +       /* Place on the response ring for the relevant domain. */ 
50654 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
50655 +       resp->id        = id;
50656 +       resp->operation = op;
50657 +       resp->status    = st;
50658 +       blk_ring->rsp_prod_pvt++;
50659 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
50660 +
50661 +       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
50662 +               /*
50663 +                * Tail check for pending requests. Allows frontend to avoid
50664 +                * notifications if requests are already in flight (lower
50665 +                * overheads and promotes batching).
50666 +                */
50667 +               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
50668 +
50669 +       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
50670 +               more_to_do = 1;
50671 +
50672 +       }
50673 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
50674 +
50675 +       if (more_to_do)
50676 +               blkif_notify_work(blkif);
50677 +       if (notify)
50678 +               notify_remote_via_irq(blkif->irq);
50679 +}
50680 +
50681 +static int __init blkif_init(void)
50682 +{
50683 +       int i, mmap_pages;
50684 +
50685 +       if (!is_running_on_xen())
50686 +               return -ENODEV;
50687 +
50688 +       mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
50689 +
50690 +       pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
50691 +                                       blkif_reqs, GFP_KERNEL);
50692 +       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
50693 +                                       mmap_pages, GFP_KERNEL);
50694 +       pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
50695 +
50696 +       if (!pending_reqs || !pending_grant_handles || !pending_pages)
50697 +               goto out_of_memory;
50698 +
50699 +       for (i = 0; i < mmap_pages; i++)
50700 +               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
50701 +
50702 +       blkif_interface_init();
50703 +
50704 +       memset(pending_reqs, 0, blkif_reqs * sizeof(pending_reqs[0]));
50705 +       INIT_LIST_HEAD(&pending_free);
50706 +
50707 +       for (i = 0; i < blkif_reqs; i++)
50708 +               list_add_tail(&pending_reqs[i].free_list, &pending_free);
50709 +
50710 +       blkif_xenbus_init();
50711 +
50712 +       return 0;
50713 +
50714 + out_of_memory:
50715 +       kfree(pending_reqs);
50716 +       kfree(pending_grant_handles);
50717 +       free_empty_pages_and_pagevec(pending_pages, mmap_pages);
50718 +       printk("%s: out of memory\n", __FUNCTION__);
50719 +       return -ENOMEM;
50720 +}
50721 +
50722 +module_init(blkif_init);
50723 +
50724 +MODULE_LICENSE("Dual BSD/GPL");
50725 diff -ruNp linux-2.6.19/drivers/xen/blkback/common.h linux-2.6.19-xen-3.0.4/drivers/xen/blkback/common.h
50726 --- linux-2.6.19/drivers/xen/blkback/common.h   1970-01-01 00:00:00.000000000 +0000
50727 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkback/common.h 2007-02-02 19:10:45.000000000 +0000
50728 @@ -0,0 +1,138 @@
50729 +/* 
50730 + * This program is free software; you can redistribute it and/or
50731 + * modify it under the terms of the GNU General Public License version 2
50732 + * as published by the Free Software Foundation; or, when distributed
50733 + * separately from the Linux kernel or incorporated into other
50734 + * software packages, subject to the following license:
50735 + * 
50736 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50737 + * of this source file (the "Software"), to deal in the Software without
50738 + * restriction, including without limitation the rights to use, copy, modify,
50739 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50740 + * and to permit persons to whom the Software is furnished to do so, subject to
50741 + * the following conditions:
50742 + * 
50743 + * The above copyright notice and this permission notice shall be included in
50744 + * all copies or substantial portions of the Software.
50745 + * 
50746 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50747 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50748 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50749 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50750 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50751 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50752 + * IN THE SOFTWARE.
50753 + */
50754 +
50755 +#ifndef __BLKIF__BACKEND__COMMON_H__
50756 +#define __BLKIF__BACKEND__COMMON_H__
50757 +
50758 +#include <linux/version.h>
50759 +#include <linux/module.h>
50760 +#include <linux/interrupt.h>
50761 +#include <linux/slab.h>
50762 +#include <linux/blkdev.h>
50763 +#include <linux/vmalloc.h>
50764 +#include <linux/wait.h>
50765 +#include <asm/io.h>
50766 +#include <asm/setup.h>
50767 +#include <asm/pgalloc.h>
50768 +#include <xen/evtchn.h>
50769 +#include <asm/hypervisor.h>
50770 +#include <xen/interface/io/blkif.h>
50771 +#include <xen/interface/io/ring.h>
50772 +#include <xen/gnttab.h>
50773 +#include <xen/driver_util.h>
50774 +#include <xen/xenbus.h>
50775 +
50776 +#define DPRINTK(_f, _a...)                     \
50777 +       pr_debug("(file=%s, line=%d) " _f,      \
50778 +                __FILE__ , __LINE__ , ## _a )
50779 +
50780 +struct vbd {
50781 +       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
50782 +       unsigned char  readonly;    /* Non-zero -> read-only */
50783 +       unsigned char  type;        /* VDISK_xxx */
50784 +       u32            pdevice;     /* phys device that this vbd maps to */
50785 +       struct block_device *bdev;
50786 +};
50787 +
50788 +struct backend_info;
50789 +
50790 +typedef struct blkif_st {
50791 +       /* Unique identifier for this interface. */
50792 +       domid_t           domid;
50793 +       unsigned int      handle;
50794 +       /* Physical parameters of the comms window. */
50795 +       unsigned int      evtchn;
50796 +       unsigned int      irq;
50797 +       /* Comms information. */
50798 +       blkif_back_ring_t blk_ring;
50799 +       struct vm_struct *blk_ring_area;
50800 +       /* The VBD attached to this interface. */
50801 +       struct vbd        vbd;
50802 +       /* Back pointer to the backend_info. */
50803 +       struct backend_info *be;
50804 +       /* Private fields. */
50805 +       spinlock_t       blk_ring_lock;
50806 +       atomic_t         refcnt;
50807 +
50808 +       wait_queue_head_t   wq;
50809 +       struct task_struct  *xenblkd;
50810 +       unsigned int        waiting_reqs;
50811 +       request_queue_t     *plug;
50812 +
50813 +       /* statistics */
50814 +       unsigned long       st_print;
50815 +       int                 st_rd_req;
50816 +       int                 st_wr_req;
50817 +       int                 st_oo_req;
50818 +       int                 st_br_req;
50819 +
50820 +       wait_queue_head_t waiting_to_free;
50821 +
50822 +       grant_handle_t shmem_handle;
50823 +       grant_ref_t    shmem_ref;
50824 +} blkif_t;
50825 +
50826 +blkif_t *blkif_alloc(domid_t domid);
50827 +void blkif_disconnect(blkif_t *blkif);
50828 +void blkif_free(blkif_t *blkif);
50829 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
50830 +
50831 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
50832 +#define blkif_put(_b)                                  \
50833 +       do {                                            \
50834 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
50835 +                       wake_up(&(_b)->waiting_to_free);\
50836 +       } while (0)
50837 +
50838 +/* Create a vbd. */
50839 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
50840 +              unsigned minor, int readonly);
50841 +void vbd_free(struct vbd *vbd);
50842 +
50843 +unsigned long long vbd_size(struct vbd *vbd);
50844 +unsigned int vbd_info(struct vbd *vbd);
50845 +unsigned long vbd_secsize(struct vbd *vbd);
50846 +
50847 +struct phys_req {
50848 +       unsigned short       dev;
50849 +       unsigned short       nr_sects;
50850 +       struct block_device *bdev;
50851 +       blkif_sector_t       sector_number;
50852 +};
50853 +
50854 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
50855 +
50856 +void blkif_interface_init(void);
50857 +
50858 +void blkif_xenbus_init(void);
50859 +
50860 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
50861 +int blkif_schedule(void *arg);
50862 +
50863 +int blkback_barrier(struct xenbus_transaction xbt,
50864 +                   struct backend_info *be, int state);
50865 +
50866 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
50867 diff -ruNp linux-2.6.19/drivers/xen/blkback/interface.c linux-2.6.19-xen-3.0.4/drivers/xen/blkback/interface.c
50868 --- linux-2.6.19/drivers/xen/blkback/interface.c        1970-01-01 00:00:00.000000000 +0000
50869 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkback/interface.c      2007-02-02 19:10:45.000000000 +0000
50870 @@ -0,0 +1,171 @@
50871 +/******************************************************************************
50872 + * arch/xen/drivers/blkif/backend/interface.c
50873 + * 
50874 + * Block-device interface management.
50875 + * 
50876 + * Copyright (c) 2004, Keir Fraser
50877 + * 
50878 + * This program is free software; you can redistribute it and/or
50879 + * modify it under the terms of the GNU General Public License version 2
50880 + * as published by the Free Software Foundation; or, when distributed
50881 + * separately from the Linux kernel or incorporated into other
50882 + * software packages, subject to the following license:
50883 + * 
50884 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50885 + * of this source file (the "Software"), to deal in the Software without
50886 + * restriction, including without limitation the rights to use, copy, modify,
50887 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50888 + * and to permit persons to whom the Software is furnished to do so, subject to
50889 + * the following conditions:
50890 + * 
50891 + * The above copyright notice and this permission notice shall be included in
50892 + * all copies or substantial portions of the Software.
50893 + * 
50894 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50895 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50896 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50897 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50898 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50899 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50900 + * IN THE SOFTWARE.
50901 + */
50902 +
50903 +#include "common.h"
50904 +#include <xen/evtchn.h>
50905 +#include <linux/kthread.h>
50906 +
50907 +static kmem_cache_t *blkif_cachep;
50908 +
50909 +blkif_t *blkif_alloc(domid_t domid)
50910 +{
50911 +       blkif_t *blkif;
50912 +
50913 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
50914 +       if (!blkif)
50915 +               return ERR_PTR(-ENOMEM);
50916 +
50917 +       memset(blkif, 0, sizeof(*blkif));
50918 +       blkif->domid = domid;
50919 +       spin_lock_init(&blkif->blk_ring_lock);
50920 +       atomic_set(&blkif->refcnt, 1);
50921 +       init_waitqueue_head(&blkif->wq);
50922 +       blkif->st_print = jiffies;
50923 +       init_waitqueue_head(&blkif->waiting_to_free);
50924 +
50925 +       return blkif;
50926 +}
50927 +
50928 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
50929 +{
50930 +       struct gnttab_map_grant_ref op;
50931 +       int ret;
50932 +
50933 +       gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
50934 +                         GNTMAP_host_map, shared_page, blkif->domid);
50935 +
50936 +       lock_vm_area(blkif->blk_ring_area);
50937 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
50938 +       unlock_vm_area(blkif->blk_ring_area);
50939 +       BUG_ON(ret);
50940 +
50941 +       if (op.status) {
50942 +               DPRINTK(" Grant table operation failure !\n");
50943 +               return op.status;
50944 +       }
50945 +
50946 +       blkif->shmem_ref = shared_page;
50947 +       blkif->shmem_handle = op.handle;
50948 +
50949 +       return 0;
50950 +}
50951 +
50952 +static void unmap_frontend_page(blkif_t *blkif)
50953 +{
50954 +       struct gnttab_unmap_grant_ref op;
50955 +       int ret;
50956 +
50957 +       gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
50958 +                           GNTMAP_host_map, blkif->shmem_handle);
50959 +
50960 +       lock_vm_area(blkif->blk_ring_area);
50961 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
50962 +       unlock_vm_area(blkif->blk_ring_area);
50963 +       BUG_ON(ret);
50964 +}
50965 +
50966 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
50967 +{
50968 +       blkif_sring_t *sring;
50969 +       int err;
50970 +       struct evtchn_bind_interdomain bind_interdomain;
50971 +
50972 +       /* Already connected through? */
50973 +       if (blkif->irq)
50974 +               return 0;
50975 +
50976 +       if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
50977 +               return -ENOMEM;
50978 +
50979 +       err = map_frontend_page(blkif, shared_page);
50980 +       if (err) {
50981 +               free_vm_area(blkif->blk_ring_area);
50982 +               return err;
50983 +       }
50984 +
50985 +       bind_interdomain.remote_dom  = blkif->domid;
50986 +       bind_interdomain.remote_port = evtchn;
50987 +
50988 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
50989 +                                         &bind_interdomain);
50990 +       if (err) {
50991 +               unmap_frontend_page(blkif);
50992 +               free_vm_area(blkif->blk_ring_area);
50993 +               return err;
50994 +       }
50995 +
50996 +       blkif->evtchn = bind_interdomain.local_port;
50997 +
50998 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
50999 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
51000 +
51001 +       blkif->irq = bind_evtchn_to_irqhandler(
51002 +               blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
51003 +
51004 +       return 0;
51005 +}
51006 +
51007 +void blkif_disconnect(blkif_t *blkif)
51008 +{
51009 +       if (blkif->xenblkd) {
51010 +               kthread_stop(blkif->xenblkd);
51011 +               blkif->xenblkd = NULL;
51012 +       }
51013 +
51014 +       atomic_dec(&blkif->refcnt);
51015 +       wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
51016 +       atomic_inc(&blkif->refcnt);
51017 +
51018 +       if (blkif->irq) {
51019 +               unbind_from_irqhandler(blkif->irq, blkif);
51020 +               blkif->irq = 0;
51021 +       }
51022 +
51023 +       if (blkif->blk_ring.sring) {
51024 +               unmap_frontend_page(blkif);
51025 +               free_vm_area(blkif->blk_ring_area);
51026 +               blkif->blk_ring.sring = NULL;
51027 +       }
51028 +}
51029 +
51030 +void blkif_free(blkif_t *blkif)
51031 +{
51032 +       if (!atomic_dec_and_test(&blkif->refcnt))
51033 +               BUG();
51034 +       kmem_cache_free(blkif_cachep, blkif);
51035 +}
51036 +
51037 +void __init blkif_interface_init(void)
51038 +{
51039 +       blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
51040 +                                        0, 0, NULL, NULL);
51041 +}
51042 diff -ruNp linux-2.6.19/drivers/xen/blkback/vbd.c linux-2.6.19-xen-3.0.4/drivers/xen/blkback/vbd.c
51043 --- linux-2.6.19/drivers/xen/blkback/vbd.c      1970-01-01 00:00:00.000000000 +0000
51044 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkback/vbd.c    2007-02-02 19:10:45.000000000 +0000
51045 @@ -0,0 +1,118 @@
51046 +/******************************************************************************
51047 + * blkback/vbd.c
51048 + * 
51049 + * Routines for managing virtual block devices (VBDs).
51050 + * 
51051 + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
51052 + * 
51053 + * This program is free software; you can redistribute it and/or
51054 + * modify it under the terms of the GNU General Public License version 2
51055 + * as published by the Free Software Foundation; or, when distributed
51056 + * separately from the Linux kernel or incorporated into other
51057 + * software packages, subject to the following license:
51058 + * 
51059 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51060 + * of this source file (the "Software"), to deal in the Software without
51061 + * restriction, including without limitation the rights to use, copy, modify,
51062 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51063 + * and to permit persons to whom the Software is furnished to do so, subject to
51064 + * the following conditions:
51065 + * 
51066 + * The above copyright notice and this permission notice shall be included in
51067 + * all copies or substantial portions of the Software.
51068 + * 
51069 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51070 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51071 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51072 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51073 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51074 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51075 + * IN THE SOFTWARE.
51076 + */
51077 +
51078 +#include "common.h"
51079 +
51080 +#define vbd_sz(_v)   ((_v)->bdev->bd_part ?                            \
51081 +       (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
51082 +
51083 +unsigned long long vbd_size(struct vbd *vbd)
51084 +{
51085 +       return vbd_sz(vbd);
51086 +}
51087 +
51088 +unsigned int vbd_info(struct vbd *vbd)
51089 +{
51090 +       return vbd->type | (vbd->readonly?VDISK_READONLY:0);
51091 +}
51092 +
51093 +unsigned long vbd_secsize(struct vbd *vbd)
51094 +{
51095 +       return bdev_hardsect_size(vbd->bdev);
51096 +}
51097 +
51098 +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
51099 +              unsigned minor, int readonly)
51100 +{
51101 +       struct vbd *vbd;
51102 +       struct block_device *bdev;
51103 +
51104 +       vbd = &blkif->vbd;
51105 +       vbd->handle   = handle; 
51106 +       vbd->readonly = readonly;
51107 +       vbd->type     = 0;
51108 +
51109 +       vbd->pdevice  = MKDEV(major, minor);
51110 +
51111 +       bdev = open_by_devnum(vbd->pdevice,
51112 +                             vbd->readonly ? FMODE_READ : FMODE_WRITE);
51113 +
51114 +       if (IS_ERR(bdev)) {
51115 +               DPRINTK("vbd_creat: device %08x could not be opened.\n",
51116 +                       vbd->pdevice);
51117 +               return -ENOENT;
51118 +       }
51119 +
51120 +       vbd->bdev = bdev;
51121 +
51122 +       if (vbd->bdev->bd_disk == NULL) {
51123 +               DPRINTK("vbd_creat: device %08x doesn't exist.\n",
51124 +                       vbd->pdevice);
51125 +               vbd_free(vbd);
51126 +               return -ENOENT;
51127 +       }
51128 +
51129 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
51130 +               vbd->type |= VDISK_CDROM;
51131 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
51132 +               vbd->type |= VDISK_REMOVABLE;
51133 +
51134 +       DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
51135 +               handle, blkif->domid);
51136 +       return 0;
51137 +}
51138 +
51139 +void vbd_free(struct vbd *vbd)
51140 +{
51141 +       if (vbd->bdev)
51142 +               blkdev_put(vbd->bdev);
51143 +       vbd->bdev = NULL;
51144 +}
51145 +
51146 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
51147 +{
51148 +       struct vbd *vbd = &blkif->vbd;
51149 +       int rc = -EACCES;
51150 +
51151 +       if ((operation != READ) && vbd->readonly)
51152 +               goto out;
51153 +
51154 +       if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
51155 +               goto out;
51156 +
51157 +       req->dev  = vbd->pdevice;
51158 +       req->bdev = vbd->bdev;
51159 +       rc = 0;
51160 +
51161 + out:
51162 +       return rc;
51163 +}
51164 diff -ruNp linux-2.6.19/drivers/xen/blkback/xenbus.c linux-2.6.19-xen-3.0.4/drivers/xen/blkback/xenbus.c
51165 --- linux-2.6.19/drivers/xen/blkback/xenbus.c   1970-01-01 00:00:00.000000000 +0000
51166 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkback/xenbus.c 2007-02-02 19:10:45.000000000 +0000
51167 @@ -0,0 +1,485 @@
51168 +/*  Xenbus code for blkif backend
51169 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
51170 +    Copyright (C) 2005 XenSource Ltd
51171 +
51172 +    This program is free software; you can redistribute it and/or modify
51173 +    it under the terms of the GNU General Public License as published by
51174 +    the Free Software Foundation; either version 2 of the License, or
51175 +    (at your option) any later version.
51176 +
51177 +    This program is distributed in the hope that it will be useful,
51178 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
51179 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
51180 +    GNU General Public License for more details.
51181 +
51182 +    You should have received a copy of the GNU General Public License
51183 +    along with this program; if not, write to the Free Software
51184 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
51185 +*/
51186 +
51187 +#include <stdarg.h>
51188 +#include <linux/module.h>
51189 +#include <linux/kthread.h>
51190 +#include "common.h"
51191 +
51192 +#undef DPRINTK
51193 +#define DPRINTK(fmt, args...)                          \
51194 +       pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",   \
51195 +                __FUNCTION__, __LINE__, ##args)
51196 +
51197 +struct backend_info
51198 +{
51199 +       struct xenbus_device *dev;
51200 +       blkif_t *blkif;
51201 +       struct xenbus_watch backend_watch;
51202 +       unsigned major;
51203 +       unsigned minor;
51204 +       char *mode;
51205 +};
51206 +
51207 +static void connect(struct backend_info *);
51208 +static int connect_ring(struct backend_info *);
51209 +static void backend_changed(struct xenbus_watch *, const char **,
51210 +                           unsigned int);
51211 +
51212 +static void update_blkif_status(blkif_t *blkif)
51213 +{ 
51214 +       int err;
51215 +
51216 +       /* Not ready to connect? */
51217 +       if (!blkif->irq || !blkif->vbd.bdev)
51218 +               return;
51219 +
51220 +       /* Already connected? */
51221 +       if (blkif->be->dev->state == XenbusStateConnected)
51222 +               return;
51223 +
51224 +       /* Attempt to connect: exit if we fail to. */
51225 +       connect(blkif->be);
51226 +       if (blkif->be->dev->state != XenbusStateConnected)
51227 +               return;
51228 +
51229 +       blkif->xenblkd = kthread_run(blkif_schedule, blkif,
51230 +                                    "xvd %d %02x:%02x",
51231 +                                    blkif->domid,
51232 +                                    blkif->be->major, blkif->be->minor);
51233 +       if (IS_ERR(blkif->xenblkd)) {
51234 +               err = PTR_ERR(blkif->xenblkd);
51235 +               blkif->xenblkd = NULL;
51236 +               xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
51237 +       }
51238 +}
51239 +
51240 +
51241 +/****************************************************************
51242 + *  sysfs interface for VBD I/O requests
51243 + */
51244 +
51245 +#define VBD_SHOW(name, format, args...)                                        \
51246 +       static ssize_t show_##name(struct device *_dev,                 \
51247 +                                  struct device_attribute *attr,       \
51248 +                                  char *buf)                           \
51249 +       {                                                               \
51250 +               struct xenbus_device *dev = to_xenbus_device(_dev);     \
51251 +               struct backend_info *be = dev->dev.driver_data;         \
51252 +                                                                       \
51253 +               return sprintf(buf, format, ##args);                    \
51254 +       }                                                               \
51255 +       DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
51256 +
51257 +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
51258 +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
51259 +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
51260 +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
51261 +
51262 +static struct attribute *vbdstat_attrs[] = {
51263 +       &dev_attr_oo_req.attr,
51264 +       &dev_attr_rd_req.attr,
51265 +       &dev_attr_wr_req.attr,
51266 +       &dev_attr_br_req.attr,
51267 +       NULL
51268 +};
51269 +
51270 +static struct attribute_group vbdstat_group = {
51271 +       .name = "statistics",
51272 +       .attrs = vbdstat_attrs,
51273 +};
51274 +
51275 +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
51276 +VBD_SHOW(mode, "%s\n", be->mode);
51277 +
51278 +int xenvbd_sysfs_addif(struct xenbus_device *dev)
51279 +{
51280 +       int error;
51281 +       
51282 +       error = device_create_file(&dev->dev, &dev_attr_physical_device);
51283 +       if (error)
51284 +               goto fail1;
51285 +
51286 +       error = device_create_file(&dev->dev, &dev_attr_mode);
51287 +       if (error)
51288 +               goto fail2;
51289 +
51290 +       error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
51291 +       if (error)
51292 +               goto fail3;
51293 +
51294 +       return 0;
51295 +
51296 +fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
51297 +fail2: device_remove_file(&dev->dev, &dev_attr_mode);
51298 +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
51299 +       return error;
51300 +}
51301 +
51302 +void xenvbd_sysfs_delif(struct xenbus_device *dev)
51303 +{
51304 +       sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
51305 +       device_remove_file(&dev->dev, &dev_attr_mode);
51306 +       device_remove_file(&dev->dev, &dev_attr_physical_device);
51307 +}
51308 +
51309 +static int blkback_remove(struct xenbus_device *dev)
51310 +{
51311 +       struct backend_info *be = dev->dev.driver_data;
51312 +
51313 +       DPRINTK("");
51314 +
51315 +       if (be->backend_watch.node) {
51316 +               unregister_xenbus_watch(&be->backend_watch);
51317 +               kfree(be->backend_watch.node);
51318 +               be->backend_watch.node = NULL;
51319 +       }
51320 +
51321 +       if (be->blkif) {
51322 +               blkif_disconnect(be->blkif);
51323 +               vbd_free(&be->blkif->vbd);
51324 +               blkif_free(be->blkif);
51325 +               be->blkif = NULL;
51326 +       }
51327 +
51328 +       if (be->major || be->minor)
51329 +               xenvbd_sysfs_delif(dev);
51330 +
51331 +       kfree(be);
51332 +       dev->dev.driver_data = NULL;
51333 +       return 0;
51334 +}
51335 +
51336 +int blkback_barrier(struct xenbus_transaction xbt,
51337 +                   struct backend_info *be, int state)
51338 +{
51339 +       struct xenbus_device *dev = be->dev;
51340 +       int err;
51341 +
51342 +       err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
51343 +                           "%d", state);
51344 +       if (err)
51345 +               xenbus_dev_fatal(dev, err, "writing feature-barrier");
51346 +
51347 +       return err;
51348 +}
51349 +
51350 +/**
51351 + * Entry point to this code when a new device is created.  Allocate the basic
51352 + * structures, and watch the store waiting for the hotplug scripts to tell us
51353 + * the device's physical major and minor numbers.  Switch to InitWait.
51354 + */
51355 +static int blkback_probe(struct xenbus_device *dev,
51356 +                        const struct xenbus_device_id *id)
51357 +{
51358 +       int err;
51359 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
51360 +                                         GFP_KERNEL);
51361 +       if (!be) {
51362 +               xenbus_dev_fatal(dev, -ENOMEM,
51363 +                                "allocating backend structure");
51364 +               return -ENOMEM;
51365 +       }
51366 +       be->dev = dev;
51367 +       dev->dev.driver_data = be;
51368 +
51369 +       be->blkif = blkif_alloc(dev->otherend_id);
51370 +       if (IS_ERR(be->blkif)) {
51371 +               err = PTR_ERR(be->blkif);
51372 +               be->blkif = NULL;
51373 +               xenbus_dev_fatal(dev, err, "creating block interface");
51374 +               goto fail;
51375 +       }
51376 +
51377 +       /* setup back pointer */
51378 +       be->blkif->be = be;
51379 +
51380 +       err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
51381 +                                &be->backend_watch, backend_changed);
51382 +       if (err)
51383 +               goto fail;
51384 +
51385 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
51386 +       if (err)
51387 +               goto fail;
51388 +
51389 +       return 0;
51390 +
51391 +fail:
51392 +       DPRINTK("failed");
51393 +       blkback_remove(dev);
51394 +       return err;
51395 +}
51396 +
51397 +
51398 +/**
51399 + * Callback received when the hotplug scripts have placed the physical-device
51400 + * node.  Read it and the mode node, and create a vbd.  If the frontend is
51401 + * ready, connect.
51402 + */
51403 +static void backend_changed(struct xenbus_watch *watch,
51404 +                           const char **vec, unsigned int len)
51405 +{
51406 +       int err;
51407 +       unsigned major;
51408 +       unsigned minor;
51409 +       struct backend_info *be
51410 +               = container_of(watch, struct backend_info, backend_watch);
51411 +       struct xenbus_device *dev = be->dev;
51412 +
51413 +       DPRINTK("");
51414 +
51415 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
51416 +                          &major, &minor);
51417 +       if (XENBUS_EXIST_ERR(err)) {
51418 +               /* Since this watch will fire once immediately after it is
51419 +                  registered, we expect this.  Ignore it, and wait for the
51420 +                  hotplug scripts. */
51421 +               return;
51422 +       }
51423 +       if (err != 2) {
51424 +               xenbus_dev_fatal(dev, err, "reading physical-device");
51425 +               return;
51426 +       }
51427 +
51428 +       if ((be->major || be->minor) &&
51429 +           ((be->major != major) || (be->minor != minor))) {
51430 +               printk(KERN_WARNING
51431 +                      "blkback: changing physical device (from %x:%x to "
51432 +                      "%x:%x) not supported.\n", be->major, be->minor,
51433 +                      major, minor);
51434 +               return;
51435 +       }
51436 +
51437 +       be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
51438 +       if (IS_ERR(be->mode)) {
51439 +               err = PTR_ERR(be->mode);
51440 +               be->mode = NULL;
51441 +               xenbus_dev_fatal(dev, err, "reading mode");
51442 +               return;
51443 +       }
51444 +
51445 +       if (be->major == 0 && be->minor == 0) {
51446 +               /* Front end dir is a number, which is used as the handle. */
51447 +
51448 +               char *p = strrchr(dev->otherend, '/') + 1;
51449 +               long handle = simple_strtoul(p, NULL, 0);
51450 +
51451 +               be->major = major;
51452 +               be->minor = minor;
51453 +
51454 +               err = vbd_create(be->blkif, handle, major, minor,
51455 +                                (NULL == strchr(be->mode, 'w')));
51456 +               if (err) {
51457 +                       be->major = be->minor = 0;
51458 +                       xenbus_dev_fatal(dev, err, "creating vbd structure");
51459 +                       return;
51460 +               }
51461 +
51462 +               err = xenvbd_sysfs_addif(dev);
51463 +               if (err) {
51464 +                       vbd_free(&be->blkif->vbd);
51465 +                       be->major = be->minor = 0;
51466 +                       xenbus_dev_fatal(dev, err, "creating sysfs entries");
51467 +                       return;
51468 +               }
51469 +
51470 +               /* We're potentially connected now */
51471 +               update_blkif_status(be->blkif);
51472 +       }
51473 +}
51474 +
51475 +
51476 +/**
51477 + * Callback received when the frontend's state changes.
51478 + */
51479 +static void frontend_changed(struct xenbus_device *dev,
51480 +                            enum xenbus_state frontend_state)
51481 +{
51482 +       struct backend_info *be = dev->dev.driver_data;
51483 +       int err;
51484 +
51485 +       DPRINTK("%s", xenbus_strstate(frontend_state));
51486 +
51487 +       switch (frontend_state) {
51488 +       case XenbusStateInitialising:
51489 +               if (dev->state == XenbusStateClosed) {
51490 +                       printk("%s: %s: prepare for reconnect\n",
51491 +                              __FUNCTION__, dev->nodename);
51492 +                       xenbus_switch_state(dev, XenbusStateInitWait);
51493 +               }
51494 +               break;
51495 +
51496 +       case XenbusStateInitialised:
51497 +       case XenbusStateConnected:
51498 +               /* Ensure we connect even when two watches fire in 
50499 +                  close succession and we miss the intermediate value
51500 +                  of frontend_state. */
51501 +               if (dev->state == XenbusStateConnected)
51502 +                       break;
51503 +
51504 +               err = connect_ring(be);
51505 +               if (err)
51506 +                       break;
51507 +               update_blkif_status(be->blkif);
51508 +               break;
51509 +
51510 +       case XenbusStateClosing:
51511 +               blkif_disconnect(be->blkif);
51512 +               xenbus_switch_state(dev, XenbusStateClosing);
51513 +               break;
51514 +
51515 +       case XenbusStateClosed:
51516 +               xenbus_switch_state(dev, XenbusStateClosed);
51517 +               if (xenbus_dev_is_online(dev))
51518 +                       break;
51519 +               /* fall through if not online */
51520 +       case XenbusStateUnknown:
51521 +               device_unregister(&dev->dev);
51522 +               break;
51523 +
51524 +       default:
51525 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
51526 +                                frontend_state);
51527 +               break;
51528 +       }
51529 +}
51530 +
51531 +
51532 +/* ** Connection ** */
51533 +
51534 +
51535 +/**
51536 + * Write the physical details regarding the block device to the store, and
51537 + * switch to Connected state.
51538 + */
51539 +static void connect(struct backend_info *be)
51540 +{
51541 +       struct xenbus_transaction xbt;
51542 +       int err;
51543 +       struct xenbus_device *dev = be->dev;
51544 +
51545 +       DPRINTK("%s", dev->otherend);
51546 +
51547 +       /* Supply the information about the device the frontend needs */
51548 +again:
51549 +       err = xenbus_transaction_start(&xbt);
51550 +       if (err) {
51551 +               xenbus_dev_fatal(dev, err, "starting transaction");
51552 +               return;
51553 +       }
51554 +
51555 +       err = blkback_barrier(xbt, be, 1);
51556 +       if (err)
51557 +               goto abort;
51558 +
51559 +       err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
51560 +                           vbd_size(&be->blkif->vbd));
51561 +       if (err) {
51562 +               xenbus_dev_fatal(dev, err, "writing %s/sectors",
51563 +                                dev->nodename);
51564 +               goto abort;
51565 +       }
51566 +
51567 +       /* FIXME: use a typename instead */
51568 +       err = xenbus_printf(xbt, dev->nodename, "info", "%u",
51569 +                           vbd_info(&be->blkif->vbd));
51570 +       if (err) {
51571 +               xenbus_dev_fatal(dev, err, "writing %s/info",
51572 +                                dev->nodename);
51573 +               goto abort;
51574 +       }
51575 +       err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
51576 +                           vbd_secsize(&be->blkif->vbd));
51577 +       if (err) {
51578 +               xenbus_dev_fatal(dev, err, "writing %s/sector-size",
51579 +                                dev->nodename);
51580 +               goto abort;
51581 +       }
51582 +
51583 +       err = xenbus_transaction_end(xbt, 0);
51584 +       if (err == -EAGAIN)
51585 +               goto again;
51586 +       if (err)
51587 +               xenbus_dev_fatal(dev, err, "ending transaction");
51588 +
51589 +       err = xenbus_switch_state(dev, XenbusStateConnected);
51590 +       if (err)
51591 +               xenbus_dev_fatal(dev, err, "switching to Connected state",
51592 +                                dev->nodename);
51593 +
51594 +       return;
51595 + abort:
51596 +       xenbus_transaction_end(xbt, 1);
51597 +}
51598 +
51599 +
51600 +static int connect_ring(struct backend_info *be)
51601 +{
51602 +       struct xenbus_device *dev = be->dev;
51603 +       unsigned long ring_ref;
51604 +       unsigned int evtchn;
51605 +       int err;
51606 +
51607 +       DPRINTK("%s", dev->otherend);
51608 +
51609 +       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
51610 +                           "event-channel", "%u", &evtchn, NULL);
51611 +       if (err) {
51612 +               xenbus_dev_fatal(dev, err,
51613 +                                "reading %s/ring-ref and event-channel",
51614 +                                dev->otherend);
51615 +               return err;
51616 +       }
51617 +
51618 +       /* Map the shared frame, irq etc. */
51619 +       err = blkif_map(be->blkif, ring_ref, evtchn);
51620 +       if (err) {
51621 +               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
51622 +                                ring_ref, evtchn);
51623 +               return err;
51624 +       }
51625 +
51626 +       return 0;
51627 +}
51628 +
51629 +
51630 +/* ** Driver Registration ** */
51631 +
51632 +
51633 +static struct xenbus_device_id blkback_ids[] = {
51634 +       { "vbd" },
51635 +       { "" }
51636 +};
51637 +
51638 +
51639 +static struct xenbus_driver blkback = {
51640 +       .name = "vbd",
51641 +       .owner = THIS_MODULE,
51642 +       .ids = blkback_ids,
51643 +       .probe = blkback_probe,
51644 +       .remove = blkback_remove,
51645 +       .otherend_changed = frontend_changed
51646 +};
51647 +
51648 +
51649 +void blkif_xenbus_init(void)
51650 +{
51651 +       xenbus_register_backend(&blkback);
51652 +}
51653 diff -ruNp linux-2.6.19/drivers/xen/blkfront/Kconfig linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/Kconfig
51654 --- linux-2.6.19/drivers/xen/blkfront/Kconfig   1970-01-01 00:00:00.000000000 +0000
51655 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/Kconfig 2007-02-02 19:10:45.000000000 +0000
51656 @@ -0,0 +1,6 @@
51657 +
51658 +config XENBLOCK
51659 +       tristate "Block device driver"
51660 +       depends on ARCH_XEN
51661 +       help
51662 +         Block device driver for Xen
51663 diff -ruNp linux-2.6.19/drivers/xen/blkfront/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/Makefile
51664 --- linux-2.6.19/drivers/xen/blkfront/Makefile  1970-01-01 00:00:00.000000000 +0000
51665 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/Makefile        2007-02-02 19:10:45.000000000 +0000
51666 @@ -0,0 +1,5 @@
51667 +
51668 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      := xenblk.o
51669 +
51670 +xenblk-objs := blkfront.o vbd.o
51671 +
51672 diff -ruNp linux-2.6.19/drivers/xen/blkfront/blkfront.c linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/blkfront.c
51673 --- linux-2.6.19/drivers/xen/blkfront/blkfront.c        1970-01-01 00:00:00.000000000 +0000
51674 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/blkfront.c      2007-02-02 19:10:45.000000000 +0000
51675 @@ -0,0 +1,891 @@
51676 +/******************************************************************************
51677 + * blkfront.c
51678 + * 
51679 + * XenLinux virtual block-device driver.
51680 + * 
51681 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
51682 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
51683 + * Copyright (c) 2004, Christian Limpach
51684 + * Copyright (c) 2004, Andrew Warfield
51685 + * Copyright (c) 2005, Christopher Clark
51686 + * Copyright (c) 2005, XenSource Ltd
51687 + * 
51688 + * This program is free software; you can redistribute it and/or
51689 + * modify it under the terms of the GNU General Public License version 2
51690 + * as published by the Free Software Foundation; or, when distributed
51691 + * separately from the Linux kernel or incorporated into other
51692 + * software packages, subject to the following license:
51693 + * 
51694 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51695 + * of this source file (the "Software"), to deal in the Software without
51696 + * restriction, including without limitation the rights to use, copy, modify,
51697 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51698 + * and to permit persons to whom the Software is furnished to do so, subject to
51699 + * the following conditions:
51700 + * 
51701 + * The above copyright notice and this permission notice shall be included in
51702 + * all copies or substantial portions of the Software.
51703 + * 
51704 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51705 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51706 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51707 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51708 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51709 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51710 + * IN THE SOFTWARE.
51711 + */
51712 +
51713 +#include <linux/version.h>
51714 +#include "block.h"
51715 +#include <linux/cdrom.h>
51716 +#include <linux/sched.h>
51717 +#include <linux/interrupt.h>
51718 +#include <scsi/scsi.h>
51719 +#include <xen/evtchn.h>
51720 +#include <xen/xenbus.h>
51721 +#include <xen/interface/grant_table.h>
51722 +#include <xen/gnttab.h>
51723 +#include <asm/hypervisor.h>
51724 +#include <asm/maddr.h>
51725 +
51726 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
51727 +#include <xen/platform-compat.h>
51728 +#endif
51729 +
51730 +#define BLKIF_STATE_DISCONNECTED 0
51731 +#define BLKIF_STATE_CONNECTED    1
51732 +#define BLKIF_STATE_SUSPENDED    2
51733 +
51734 +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
51735 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
51736 +#define GRANT_INVALID_REF      0
51737 +
51738 +static void connect(struct blkfront_info *);
51739 +static void blkfront_closing(struct xenbus_device *);
51740 +static int blkfront_remove(struct xenbus_device *);
51741 +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
51742 +static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
51743 +
51744 +static void kick_pending_request_queues(struct blkfront_info *);
51745 +
51746 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
51747 +static void blkif_restart_queue(void *arg);
51748 +static void blkif_recover(struct blkfront_info *);
51749 +static void blkif_completion(struct blk_shadow *);
51750 +static void blkif_free(struct blkfront_info *, int);
51751 +
51752 +
51753 +/**
51754 + * Entry point to this code when a new device is created.  Allocate the basic
51755 + * structures and the ring buffer for communication with the backend, and
51756 + * inform the backend of the appropriate details for those.  Switch to
51757 + * Initialised state.
51758 + */
51759 +static int blkfront_probe(struct xenbus_device *dev,
51760 +                         const struct xenbus_device_id *id)
51761 +{
51762 +       int err, vdevice, i;
51763 +       struct blkfront_info *info;
51764 +
51765 +       /* FIXME: Use dynamic device id if this is not set. */
51766 +       err = xenbus_scanf(XBT_NIL, dev->nodename,
51767 +                          "virtual-device", "%i", &vdevice);
51768 +       if (err != 1) {
51769 +               xenbus_dev_fatal(dev, err, "reading virtual-device");
51770 +               return err;
51771 +       }
51772 +
51773 +       info = kzalloc(sizeof(*info), GFP_KERNEL);
51774 +       if (!info) {
51775 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
51776 +               return -ENOMEM;
51777 +       }
51778 +
51779 +       info->xbdev = dev;
51780 +       info->vdevice = vdevice;
51781 +       info->connected = BLKIF_STATE_DISCONNECTED;
51782 +       INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
51783 +
51784 +       for (i = 0; i < BLK_RING_SIZE; i++)
51785 +               info->shadow[i].req.id = i+1;
51786 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
51787 +
51788 +       /* Front end dir is a number, which is used as the id. */
51789 +       info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
51790 +       dev->dev.driver_data = info;
51791 +
51792 +       err = talk_to_backend(dev, info);
51793 +       if (err) {
51794 +               kfree(info);
51795 +               dev->dev.driver_data = NULL;
51796 +               return err;
51797 +       }
51798 +
51799 +       return 0;
51800 +}
51801 +
51802 +
51803 +/**
51804 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
51805 + * driver restart.  We tear down our blkif structure and recreate it, but
51806 + * leave the device-layer structures intact so that this is transparent to the
51807 + * rest of the kernel.
51808 + */
51809 +static int blkfront_resume(struct xenbus_device *dev)
51810 +{
51811 +       struct blkfront_info *info = dev->dev.driver_data;
51812 +       int err;
51813 +
51814 +       DPRINTK("blkfront_resume: %s\n", dev->nodename);
51815 +
51816 +       blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
51817 +
51818 +       err = talk_to_backend(dev, info);
51819 +       if (info->connected == BLKIF_STATE_SUSPENDED && !err)
51820 +               blkif_recover(info);
51821 +
51822 +       return err;
51823 +}
51824 +
51825 +
51826 +/* Common code used when first setting up, and when resuming. */
51827 +static int talk_to_backend(struct xenbus_device *dev,
51828 +                          struct blkfront_info *info)
51829 +{
51830 +       const char *message = NULL;
51831 +       struct xenbus_transaction xbt;
51832 +       int err;
51833 +
51834 +       /* Create shared ring, alloc event channel. */
51835 +       err = setup_blkring(dev, info);
51836 +       if (err)
51837 +               goto out;
51838 +
51839 +again:
51840 +       err = xenbus_transaction_start(&xbt);
51841 +       if (err) {
51842 +               xenbus_dev_fatal(dev, err, "starting transaction");
51843 +               goto destroy_blkring;
51844 +       }
51845 +
51846 +       err = xenbus_printf(xbt, dev->nodename,
51847 +                           "ring-ref","%u", info->ring_ref);
51848 +       if (err) {
51849 +               message = "writing ring-ref";
51850 +               goto abort_transaction;
51851 +       }
51852 +       err = xenbus_printf(xbt, dev->nodename,
51853 +                           "event-channel", "%u", info->evtchn);
51854 +       if (err) {
51855 +               message = "writing event-channel";
51856 +               goto abort_transaction;
51857 +       }
51858 +
51859 +       err = xenbus_transaction_end(xbt, 0);
51860 +       if (err) {
51861 +               if (err == -EAGAIN)
51862 +                       goto again;
51863 +               xenbus_dev_fatal(dev, err, "completing transaction");
51864 +               goto destroy_blkring;
51865 +       }
51866 +
51867 +       xenbus_switch_state(dev, XenbusStateInitialised);
51868 +
51869 +       return 0;
51870 +
51871 + abort_transaction:
51872 +       xenbus_transaction_end(xbt, 1);
51873 +       if (message)
51874 +               xenbus_dev_fatal(dev, err, "%s", message);
51875 + destroy_blkring:
51876 +       blkif_free(info, 0);
51877 + out:
51878 +       return err;
51879 +}
51880 +
51881 +
51882 +static int setup_blkring(struct xenbus_device *dev,
51883 +                        struct blkfront_info *info)
51884 +{
51885 +       blkif_sring_t *sring;
51886 +       int err;
51887 +
51888 +       info->ring_ref = GRANT_INVALID_REF;
51889 +
51890 +       sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
51891 +       if (!sring) {
51892 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
51893 +               return -ENOMEM;
51894 +       }
51895 +       SHARED_RING_INIT(sring);
51896 +       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
51897 +
51898 +       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
51899 +       if (err < 0) {
51900 +               free_page((unsigned long)sring);
51901 +               info->ring.sring = NULL;
51902 +               goto fail;
51903 +       }
51904 +       info->ring_ref = err;
51905 +
51906 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
51907 +       if (err)
51908 +               goto fail;
51909 +
51910 +       err = bind_evtchn_to_irqhandler(
51911 +               info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
51912 +       if (err <= 0) {
51913 +               xenbus_dev_fatal(dev, err,
51914 +                                "bind_evtchn_to_irqhandler failed");
51915 +               goto fail;
51916 +       }
51917 +       info->irq = err;
51918 +
51919 +       return 0;
51920 +fail:
51921 +       blkif_free(info, 0);
51922 +       return err;
51923 +}
51924 +
51925 +
51926 +/**
51927 + * Callback received when the backend's state changes.
51928 + */
51929 +static void backend_changed(struct xenbus_device *dev,
51930 +                           enum xenbus_state backend_state)
51931 +{
51932 +       struct blkfront_info *info = dev->dev.driver_data;
51933 +       struct block_device *bd;
51934 +
51935 +       DPRINTK("blkfront:backend_changed.\n");
51936 +
51937 +       switch (backend_state) {
51938 +       case XenbusStateInitialising:
51939 +       case XenbusStateInitWait:
51940 +       case XenbusStateInitialised:
51941 +       case XenbusStateUnknown:
51942 +       case XenbusStateClosed:
51943 +               break;
51944 +
51945 +       case XenbusStateConnected:
51946 +               connect(info);
51947 +               break;
51948 +
51949 +       case XenbusStateClosing:
51950 +               bd = bdget(info->dev);
51951 +               if (bd == NULL)
51952 +                       xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
51953 +
51954 +               mutex_lock(&bd->bd_mutex);
51955 +               if (info->users > 0)
51956 +                       xenbus_dev_error(dev, -EBUSY,
51957 +                                        "Device in use; refusing to close");
51958 +               else
51959 +                       blkfront_closing(dev);
51960 +               mutex_unlock(&bd->bd_mutex);
51961 +               bdput(bd);
51962 +               break;
51963 +       }
51964 +}
51965 +
51966 +
51967 +/* ** Connection ** */
51968 +
51969 +
51970 +/*
51971 + * Invoked when the backend is finally 'ready' (and has told us
51972 + * the details about the physical device - #sectors, size, etc).
51973 + */
51974 +static void connect(struct blkfront_info *info)
51975 +{
51976 +       unsigned long long sectors;
51977 +       unsigned long sector_size;
51978 +       unsigned int binfo;
51979 +       int err;
51980 +
51981 +       if ((info->connected == BLKIF_STATE_CONNECTED) ||
51982 +           (info->connected == BLKIF_STATE_SUSPENDED) )
51983 +               return;
51984 +
51985 +       DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
51986 +
51987 +       err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
51988 +                           "sectors", "%Lu", &sectors,
51989 +                           "info", "%u", &binfo,
51990 +                           "sector-size", "%lu", &sector_size,
51991 +                           NULL);
51992 +       if (err) {
51993 +               xenbus_dev_fatal(info->xbdev, err,
51994 +                                "reading backend fields at %s",
51995 +                                info->xbdev->otherend);
51996 +               return;
51997 +       }
51998 +
51999 +       err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
52000 +                           "feature-barrier", "%lu", &info->feature_barrier,
52001 +                           NULL);
52002 +       if (err)
52003 +               info->feature_barrier = 0;
52004 +
52005 +       err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
52006 +       if (err) {
52007 +               xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
52008 +                                info->xbdev->otherend);
52009 +               return;
52010 +       }
52011 +
52012 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
52013 +
52014 +       /* Kick pending requests. */
52015 +       spin_lock_irq(&blkif_io_lock);
52016 +       info->connected = BLKIF_STATE_CONNECTED;
52017 +       kick_pending_request_queues(info);
52018 +       spin_unlock_irq(&blkif_io_lock);
52019 +
52020 +       add_disk(info->gd);
52021 +}
52022 +
52023 +/**
52024 + * Handle the change of state of the backend to Closing.  We must delete our
52025 + * device-layer structures now, to ensure that writes are flushed through to
52026 + * the backend.  Once this is done, we can switch to Closed in
52027 + * acknowledgement.
52028 + */
52029 +static void blkfront_closing(struct xenbus_device *dev)
52030 +{
52031 +       struct blkfront_info *info = dev->dev.driver_data;
52032 +       unsigned long flags;
52033 +
52034 +       DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
52035 +
52036 +       if (info->rq == NULL)
52037 +               goto out;
52038 +
52039 +       spin_lock_irqsave(&blkif_io_lock, flags);
52040 +       /* No more blkif_request(). */
52041 +       blk_stop_queue(info->rq);
52042 +       /* No more gnttab callback work. */
52043 +       gnttab_cancel_free_callback(&info->callback);
52044 +       spin_unlock_irqrestore(&blkif_io_lock, flags);
52045 +
52046 +       /* Flush gnttab callback work. Must be done with no locks held. */
52047 +       flush_scheduled_work();
52048 +
52049 +       xlvbd_del(info);
52050 +
52051 + out:
52052 +       xenbus_frontend_closed(dev);
52053 +}
52054 +
52055 +
52056 +static int blkfront_remove(struct xenbus_device *dev)
52057 +{
52058 +       struct blkfront_info *info = dev->dev.driver_data;
52059 +
52060 +       DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
52061 +
52062 +       blkif_free(info, 0);
52063 +
52064 +       kfree(info);
52065 +
52066 +       return 0;
52067 +}
52068 +
52069 +
52070 +static inline int GET_ID_FROM_FREELIST(
52071 +       struct blkfront_info *info)
52072 +{
52073 +       unsigned long free = info->shadow_free;
52074 +       BUG_ON(free > BLK_RING_SIZE);
52075 +       info->shadow_free = info->shadow[free].req.id;
52076 +       info->shadow[free].req.id = 0x0fffffee; /* debug */
52077 +       return free;
52078 +}
52079 +
52080 +static inline void ADD_ID_TO_FREELIST(
52081 +       struct blkfront_info *info, unsigned long id)
52082 +{
52083 +       info->shadow[id].req.id  = info->shadow_free;
52084 +       info->shadow[id].request = 0;
52085 +       info->shadow_free = id;
52086 +}
52087 +
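GET_ID_FROM_FREELIST and ADD_ID_TO_FREELIST above thread a singly linked free list through the otherwise-idle req.id fields of the shadow array, with shadow_free pointing at the current head. A minimal userspace sketch of the same bookkeeping (the 8-entry ring and the helper names are illustrative only, not from the patch):

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8

static unsigned long next_id[RING_SIZE];   /* stands in for shadow[i].req.id */
static unsigned long shadow_free;

static unsigned long get_id(void)
{
        unsigned long free = shadow_free;
        assert(free < RING_SIZE);
        shadow_free = next_id[free];        /* pop the head of the chain */
        next_id[free] = 0x0fffffee;         /* poison, as in the driver */
        return free;
}

static void add_id(unsigned long id)
{
        next_id[id] = shadow_free;          /* push the freed slot on the head */
        shadow_free = id;
}

int main(void)
{
        unsigned long a, b;
        int i;

        for (i = 0; i < RING_SIZE; i++)     /* same init as blkfront_probe */
                next_id[i] = i + 1;
        next_id[RING_SIZE - 1] = 0x0fffffff;
        shadow_free = 0;

        a = get_id();                       /* 0 */
        b = get_id();                       /* 1 */
        add_id(a);                          /* slot 0 goes back on the head */
        printf("%lu %lu %lu\n", a, b, get_id());   /* prints: 0 1 0 */
        return 0;
}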
52088 +static inline void flush_requests(struct blkfront_info *info)
52089 +{
52090 +       int notify;
52091 +
52092 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
52093 +
52094 +       if (notify)
52095 +               notify_remote_via_irq(info->irq);
52096 +}
52097 +
52098 +static void kick_pending_request_queues(struct blkfront_info *info)
52099 +{
52100 +       if (!RING_FULL(&info->ring)) {
52101 +               /* Re-enable calldowns. */
52102 +               blk_start_queue(info->rq);
52103 +               /* Kick things off immediately. */
52104 +               do_blkif_request(info->rq);
52105 +       }
52106 +}
52107 +
52108 +static void blkif_restart_queue(void *arg)
52109 +{
52110 +       struct blkfront_info *info = (struct blkfront_info *)arg;
52111 +       spin_lock_irq(&blkif_io_lock);
52112 +       if (info->connected == BLKIF_STATE_CONNECTED)
52113 +               kick_pending_request_queues(info);
52114 +       spin_unlock_irq(&blkif_io_lock);
52115 +}
52116 +
52117 +static void blkif_restart_queue_callback(void *arg)
52118 +{
52119 +       struct blkfront_info *info = (struct blkfront_info *)arg;
52120 +       schedule_work(&info->work);
52121 +}
52122 +
52123 +int blkif_open(struct inode *inode, struct file *filep)
52124 +{
52125 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
52126 +       info->users++;
52127 +       return 0;
52128 +}
52129 +
52130 +
52131 +int blkif_release(struct inode *inode, struct file *filep)
52132 +{
52133 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
52134 +       info->users--;
52135 +       if (info->users == 0) {
52136 +               /* Check whether we have been instructed to close.  We will
52137 +                  have ignored this request initially, as the device was
52138 +                  still mounted. */
52139 +               struct xenbus_device * dev = info->xbdev;
52140 +               enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
52141 +
52142 +               if (state == XenbusStateClosing)
52143 +                       blkfront_closing(dev);
52144 +       }
52145 +       return 0;
52146 +}
52147 +
52148 +
52149 +int blkif_ioctl(struct inode *inode, struct file *filep,
52150 +               unsigned command, unsigned long argument)
52151 +{
52152 +       int i;
52153 +
52154 +       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
52155 +                     command, (long)argument, inode->i_rdev);
52156 +
52157 +       switch (command) {
52158 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
52159 +       case HDIO_GETGEO: {
52160 +               struct block_device *bd = inode->i_bdev;
52161 +               struct hd_geometry geo;
52162 +               int ret;
52163 +
52164 +                if (!argument)
52165 +                        return -EINVAL;
52166 +
52167 +               geo.start = get_start_sect(bd);
52168 +               ret = blkif_getgeo(bd, &geo);
52169 +               if (ret)
52170 +                       return ret;
52171 +
52172 +               if (copy_to_user((struct hd_geometry __user *)argument, &geo,
52173 +                                sizeof(geo)))
52174 +                        return -EFAULT;
52175 +
52176 +                return 0;
52177 +       }
52178 +#endif
52179 +       case CDROMMULTISESSION:
52180 +               DPRINTK("FIXME: support multisession CDs later\n");
52181 +               for (i = 0; i < sizeof(struct cdrom_multisession); i++)
52182 +                       if (put_user(0, (char __user *)(argument + i)))
52183 +                               return -EFAULT;
52184 +               return 0;
52185 +
52186 +       default:
52187 +               /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
52188 +                 command);*/
52189 +               return -EINVAL; /* same return as native Linux */
52190 +       }
52191 +
52192 +       return 0;
52193 +}
52194 +
52195 +
52196 +int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
52197 +{
52198 +       /* We don't have real geometry info, but let's at least return
52199 +          values consistent with the size of the device */
52200 +       sector_t nsect = get_capacity(bd->bd_disk);
52201 +       sector_t cylinders = nsect;
52202 +
52203 +       hg->heads = 0xff;
52204 +       hg->sectors = 0x3f;
52205 +       sector_div(cylinders, hg->heads * hg->sectors);
52206 +       hg->cylinders = cylinders;
52207 +       if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
52208 +               hg->cylinders = 0xffff;
52209 +       return 0;
52210 +}
52211 +
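blkif_getgeo above fabricates a CHS geometry of 255 heads and 63 sectors per track and derives the cylinder count from the capacity. A minimal userspace sketch of the same arithmetic with an assumed 10 GiB capacity (the capacity value is an example, not from the patch):

#include <stdio.h>

int main(void)
{
        unsigned long long nsect = 20971520ULL;  /* 10 GiB in 512-byte sectors */
        unsigned int heads = 0xff, sectors = 0x3f;
        unsigned long long cylinders = nsect / (heads * sectors);

        if ((cylinders + 1) * heads * sectors < nsect)
                cylinders = 0xffff;              /* clamp, as the driver does */

        printf("C/H/S = %llu/%u/%u (covers %llu of %llu sectors)\n",
               cylinders, heads, sectors,
               cylinders * heads * sectors, nsect);
        return 0;
}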
52212 +
52213 +/*
52214 + * blkif_queue_request
52215 + *
52216 + * request block io
52217 + *
52218 + * id: for guest use only.
52219 + * operation: BLKIF_OP_{READ,WRITE,PROBE}
52220 + * buffer: buffer to read/write into. this should be a
52221 + *   virtual address in the guest os.
52222 + */
52223 +static int blkif_queue_request(struct request *req)
52224 +{
52225 +       struct blkfront_info *info = req->rq_disk->private_data;
52226 +       unsigned long buffer_mfn;
52227 +       blkif_request_t *ring_req;
52228 +       struct bio *bio;
52229 +       struct bio_vec *bvec;
52230 +       int idx;
52231 +       unsigned long id;
52232 +       unsigned int fsect, lsect;
52233 +       int ref;
52234 +       grant_ref_t gref_head;
52235 +
52236 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
52237 +               return 1;
52238 +
52239 +       if (gnttab_alloc_grant_references(
52240 +               BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
52241 +               gnttab_request_free_callback(
52242 +                       &info->callback,
52243 +                       blkif_restart_queue_callback,
52244 +                       info,
52245 +                       BLKIF_MAX_SEGMENTS_PER_REQUEST);
52246 +               return 1;
52247 +       }
52248 +
52249 +       /* Fill out a communications ring structure. */
52250 +       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
52251 +       id = GET_ID_FROM_FREELIST(info);
52252 +       info->shadow[id].request = (unsigned long)req;
52253 +
52254 +       ring_req->id = id;
52255 +       ring_req->sector_number = (blkif_sector_t)req->sector;
52256 +       ring_req->handle = info->handle;
52257 +
52258 +       ring_req->operation = rq_data_dir(req) ?
52259 +               BLKIF_OP_WRITE : BLKIF_OP_READ;
52260 +       if (blk_barrier_rq(req))
52261 +               ring_req->operation = BLKIF_OP_WRITE_BARRIER;
52262 +
52263 +       ring_req->nr_segments = 0;
52264 +       rq_for_each_bio (bio, req) {
52265 +               bio_for_each_segment (bvec, bio, idx) {
52266 +                       BUG_ON(ring_req->nr_segments
52267 +                              == BLKIF_MAX_SEGMENTS_PER_REQUEST);
52268 +                       buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
52269 +                       fsect = bvec->bv_offset >> 9;
52270 +                       lsect = fsect + (bvec->bv_len >> 9) - 1;
52271 +                       /* install a grant reference. */
52272 +                       ref = gnttab_claim_grant_reference(&gref_head);
52273 +                       BUG_ON(ref == -ENOSPC);
52274 +
52275 +                       gnttab_grant_foreign_access_ref(
52276 +                               ref,
52277 +                               info->xbdev->otherend_id,
52278 +                               buffer_mfn,
52279 +                               rq_data_dir(req) );
52280 +
52281 +                       info->shadow[id].frame[ring_req->nr_segments] =
52282 +                               mfn_to_pfn(buffer_mfn);
52283 +
52284 +                       ring_req->seg[ring_req->nr_segments] =
52285 +                               (struct blkif_request_segment) {
52286 +                                       .gref       = ref,
52287 +                                       .first_sect = fsect,
52288 +                                       .last_sect  = lsect };
52289 +
52290 +                       ring_req->nr_segments++;
52291 +               }
52292 +       }
52293 +
52294 +       info->ring.req_prod_pvt++;
52295 +
52296 +       /* Keep a private copy so we can reissue requests when recovering. */
52297 +       info->shadow[id].req = *ring_req;
52298 +
52299 +       gnttab_free_grant_references(gref_head);
52300 +
52301 +       return 0;
52302 +}
52303 +
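In blkif_queue_request above, each bio_vec becomes one ring segment: fsect and lsect are the first and last 512-byte sectors of the granted page that the transfer touches. A minimal userspace sketch with example offset/length values (not from the patch):

#include <stdio.h>

int main(void)
{
        unsigned int bv_offset = 1024, bv_len = 2048;   /* within one page */
        unsigned int fsect = bv_offset >> 9;            /* 512-byte units  */
        unsigned int lsect = fsect + (bv_len >> 9) - 1;

        /* Sectors 2..5 of the granted page, i.e. bytes 1024..3071. */
        printf("first_sect=%u last_sect=%u\n", fsect, lsect);
        return 0;
}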
52304 +/*
52305 + * do_blkif_request
52306 + *  read a block; request is in a request queue
52307 + */
52308 +void do_blkif_request(request_queue_t *rq)
52309 +{
52310 +       struct blkfront_info *info = NULL;
52311 +       struct request *req;
52312 +       int queued;
52313 +
52314 +       DPRINTK("Entered do_blkif_request\n");
52315 +
52316 +       queued = 0;
52317 +
52318 +       while ((req = elv_next_request(rq)) != NULL) {
52319 +               info = req->rq_disk->private_data;
52320 +               if (!blk_fs_request(req)) {
52321 +                       end_request(req, 0);
52322 +                       continue;
52323 +               }
52324 +
52325 +               if (RING_FULL(&info->ring))
52326 +                       goto wait;
52327 +
52328 +               DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
52329 +                       "(%u/%li) buffer:%p [%s]\n",
52330 +                       req, req->cmd, req->sector, req->current_nr_sectors,
52331 +                       req->nr_sectors, req->buffer,
52332 +                       rq_data_dir(req) ? "write" : "read");
52333 +
52334 +
52335 +               blkdev_dequeue_request(req);
52336 +               if (blkif_queue_request(req)) {
52337 +                       blk_requeue_request(rq, req);
52338 +               wait:
52339 +                       /* Avoid pointless unplugs. */
52340 +                       blk_stop_queue(rq);
52341 +                       break;
52342 +               }
52343 +
52344 +               queued++;
52345 +       }
52346 +
52347 +       if (queued != 0)
52348 +               flush_requests(info);
52349 +}
52350 +
52351 +
52352 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
52353 +{
52354 +       struct request *req;
52355 +       blkif_response_t *bret;
52356 +       RING_IDX i, rp;
52357 +       unsigned long flags;
52358 +       struct blkfront_info *info = (struct blkfront_info *)dev_id;
52359 +       int uptodate;
52360 +
52361 +       spin_lock_irqsave(&blkif_io_lock, flags);
52362 +
52363 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
52364 +               spin_unlock_irqrestore(&blkif_io_lock, flags);
52365 +               return IRQ_HANDLED;
52366 +       }
52367 +
52368 + again:
52369 +       rp = info->ring.sring->rsp_prod;
52370 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
52371 +
52372 +       for (i = info->ring.rsp_cons; i != rp; i++) {
52373 +               unsigned long id;
52374 +               int ret;
52375 +
52376 +               bret = RING_GET_RESPONSE(&info->ring, i);
52377 +               id   = bret->id;
52378 +               req  = (struct request *)info->shadow[id].request;
52379 +
52380 +               blkif_completion(&info->shadow[id]);
52381 +
52382 +               ADD_ID_TO_FREELIST(info, id);
52383 +
52384 +               uptodate = (bret->status == BLKIF_RSP_OKAY);
52385 +               switch (bret->operation) {
52386 +               case BLKIF_OP_WRITE_BARRIER:
52387 +                       if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
52388 +                               printk("blkfront: %s: write barrier op failed\n",
52389 +                                      info->gd->disk_name);
52390 +                               uptodate = -EOPNOTSUPP;
52391 +                               info->feature_barrier = 0;
52392 +                               xlvbd_barrier(info);
52393 +                       }
52394 +                       /* fall through */
52395 +               case BLKIF_OP_READ:
52396 +               case BLKIF_OP_WRITE:
52397 +                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
52398 +                               DPRINTK("Bad return from blkdev data "
52399 +                                       "request: %x\n", bret->status);
52400 +
52401 +                       ret = end_that_request_first(req, uptodate,
52402 +                               req->hard_nr_sectors);
52403 +                       BUG_ON(ret);
52404 +                       end_that_request_last(req, uptodate);
52405 +                       break;
52406 +               default:
52407 +                       BUG();
52408 +               }
52409 +       }
52410 +
52411 +       info->ring.rsp_cons = i;
52412 +
52413 +       if (i != info->ring.req_prod_pvt) {
52414 +               int more_to_do;
52415 +               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
52416 +               if (more_to_do)
52417 +                       goto again;
52418 +       } else
52419 +               info->ring.sring->rsp_event = i + 1;
52420 +
52421 +       kick_pending_request_queues(info);
52422 +
52423 +       spin_unlock_irqrestore(&blkif_io_lock, flags);
52424 +
52425 +       return IRQ_HANDLED;
52426 +}
52427 +
52428 +static void blkif_free(struct blkfront_info *info, int suspend)
52429 +{
52430 +       /* Prevent new requests being issued until we fix things up. */
52431 +       spin_lock_irq(&blkif_io_lock);
52432 +       info->connected = suspend ?
52433 +               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
52434 +       /* No more blkif_request(). */
52435 +       if (info->rq)
52436 +               blk_stop_queue(info->rq);
52437 +       /* No more gnttab callback work. */
52438 +       gnttab_cancel_free_callback(&info->callback);
52439 +       spin_unlock_irq(&blkif_io_lock);
52440 +
52441 +       /* Flush gnttab callback work. Must be done with no locks held. */
52442 +       flush_scheduled_work();
52443 +
52444 +       /* Free resources associated with old device channel. */
52445 +       if (info->ring_ref != GRANT_INVALID_REF) {
52446 +               gnttab_end_foreign_access(info->ring_ref, 0,
52447 +                                         (unsigned long)info->ring.sring);
52448 +               info->ring_ref = GRANT_INVALID_REF;
52449 +               info->ring.sring = NULL;
52450 +       }
52451 +       if (info->irq)
52452 +               unbind_from_irqhandler(info->irq, info);
52453 +       info->evtchn = info->irq = 0;
52454 +
52455 +}
52456 +
52457 +static void blkif_completion(struct blk_shadow *s)
52458 +{
52459 +       int i;
52460 +       for (i = 0; i < s->req.nr_segments; i++)
52461 +               gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
52462 +}
52463 +
52464 +static void blkif_recover(struct blkfront_info *info)
52465 +{
52466 +       int i;
52467 +       blkif_request_t *req;
52468 +       struct blk_shadow *copy;
52469 +       int j;
52470 +
52471 +       /* Stage 1: Make a safe copy of the shadow state. */
52472 +       copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
52473 +       memcpy(copy, info->shadow, sizeof(info->shadow));
52474 +
52475 +       /* Stage 2: Set up free list. */
52476 +       memset(&info->shadow, 0, sizeof(info->shadow));
52477 +       for (i = 0; i < BLK_RING_SIZE; i++)
52478 +               info->shadow[i].req.id = i+1;
52479 +       info->shadow_free = info->ring.req_prod_pvt;
52480 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
52481 +
52482 +       /* Stage 3: Find pending requests and requeue them. */
52483 +       for (i = 0; i < BLK_RING_SIZE; i++) {
52484 +               /* Not in use? */
52485 +               if (copy[i].request == 0)
52486 +                       continue;
52487 +
52488 +               /* Grab a request slot and copy shadow state into it. */
52489 +               req = RING_GET_REQUEST(
52490 +                       &info->ring, info->ring.req_prod_pvt);
52491 +               *req = copy[i].req;
52492 +
52493 +               /* We get a new request id, and must reset the shadow state. */
52494 +               req->id = GET_ID_FROM_FREELIST(info);
52495 +               memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
52496 +
52497 +               /* Rewrite any grant references invalidated by susp/resume. */
52498 +               for (j = 0; j < req->nr_segments; j++)
52499 +                       gnttab_grant_foreign_access_ref(
52500 +                               req->seg[j].gref,
52501 +                               info->xbdev->otherend_id,
52502 +                               pfn_to_mfn(info->shadow[req->id].frame[j]),
52503 +                               rq_data_dir(
52504 +                                       (struct request *)
52505 +                                       info->shadow[req->id].request));
52506 +               info->shadow[req->id].req = *req;
52507 +
52508 +               info->ring.req_prod_pvt++;
52509 +       }
52510 +
52511 +       kfree(copy);
52512 +
52513 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
52514 +
52515 +       spin_lock_irq(&blkif_io_lock);
52516 +
52517 +       /* Now safe for us to use the shared ring */
52518 +       info->connected = BLKIF_STATE_CONNECTED;
52519 +
52520 +       /* Send off requeued requests */
52521 +       flush_requests(info);
52522 +
52523 +       /* Kick any other new requests queued since we resumed */
52524 +       kick_pending_request_queues(info);
52525 +
52526 +       spin_unlock_irq(&blkif_io_lock);
52527 +}
52528 +
52529 +
52530 +/* ** Driver Registration ** */
52531 +
52532 +
52533 +static struct xenbus_device_id blkfront_ids[] = {
52534 +       { "vbd" },
52535 +       { "" }
52536 +};
52537 +
52538 +
52539 +static struct xenbus_driver blkfront = {
52540 +       .name = "vbd",
52541 +       .owner = THIS_MODULE,
52542 +       .ids = blkfront_ids,
52543 +       .probe = blkfront_probe,
52544 +       .remove = blkfront_remove,
52545 +       .resume = blkfront_resume,
52546 +       .otherend_changed = backend_changed,
52547 +};
52548 +
52549 +
52550 +static int __init xlblk_init(void)
52551 +{
52552 +       if (!is_running_on_xen())
52553 +               return -ENODEV;
52554 +
52555 +       return xenbus_register_frontend(&blkfront);
52556 +}
52557 +module_init(xlblk_init);
52558 +
52559 +
52560 +static void xlblk_exit(void)
52561 +{
52562 +       return xenbus_unregister_driver(&blkfront);
52563 +}
52564 +module_exit(xlblk_exit);
52565 +
52566 +MODULE_LICENSE("Dual BSD/GPL");
52567 diff -ruNp linux-2.6.19/drivers/xen/blkfront/block.h linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/block.h
52568 --- linux-2.6.19/drivers/xen/blkfront/block.h   1970-01-01 00:00:00.000000000 +0000
52569 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/block.h 2007-02-02 19:10:45.000000000 +0000
52570 @@ -0,0 +1,156 @@
52571 +/******************************************************************************
52572 + * block.h
52573 + * 
52574 + * Shared definitions between all levels of XenLinux Virtual block devices.
52575 + * 
52576 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
52577 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
52578 + * Copyright (c) 2004-2005, Christian Limpach
52579 + * 
52580 + * This program is free software; you can redistribute it and/or
52581 + * modify it under the terms of the GNU General Public License version 2
52582 + * as published by the Free Software Foundation; or, when distributed
52583 + * separately from the Linux kernel or incorporated into other
52584 + * software packages, subject to the following license:
52585 + * 
52586 + * Permission is hereby granted, free of charge, to any person obtaining a copy
52587 + * of this source file (the "Software"), to deal in the Software without
52588 + * restriction, including without limitation the rights to use, copy, modify,
52589 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
52590 + * and to permit persons to whom the Software is furnished to do so, subject to
52591 + * the following conditions:
52592 + * 
52593 + * The above copyright notice and this permission notice shall be included in
52594 + * all copies or substantial portions of the Software.
52595 + * 
52596 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52597 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52598 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52599 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52600 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52601 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
52602 + * IN THE SOFTWARE.
52603 + */
52604 +
52605 +#ifndef __XEN_DRIVERS_BLOCK_H__
52606 +#define __XEN_DRIVERS_BLOCK_H__
52607 +
52608 +#include <linux/version.h>
52609 +#include <linux/module.h>
52610 +#include <linux/kernel.h>
52611 +#include <linux/sched.h>
52612 +#include <linux/slab.h>
52613 +#include <linux/string.h>
52614 +#include <linux/errno.h>
52615 +#include <linux/fs.h>
52616 +#include <linux/hdreg.h>
52617 +#include <linux/blkdev.h>
52618 +#include <linux/major.h>
52619 +#include <asm/hypervisor.h>
52620 +#include <xen/xenbus.h>
52621 +#include <xen/gnttab.h>
52622 +#include <xen/interface/xen.h>
52623 +#include <xen/interface/io/blkif.h>
52624 +#include <xen/interface/io/ring.h>
52625 +#include <asm/io.h>
52626 +#include <asm/atomic.h>
52627 +#include <asm/uaccess.h>
52628 +
52629 +#if 1
52630 +#define IPRINTK(fmt, args...)                          \
52631 +       printk(KERN_INFO "xen_blk: " fmt, ##args)
52632 +#else
52633 +#define IPRINTK(fmt, args...) ((void)0)
52634 +#endif
52635 +
52636 +#if 1
52637 +#define WPRINTK(fmt, args...)                          \
52638 +       printk(KERN_WARNING "xen_blk: " fmt, ##args)
52639 +#else
52640 +#define WPRINTK(fmt, args...) ((void)0)
52641 +#endif
52642 +
52643 +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
52644 +
52645 +#if 0
52646 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
52647 +#else
52648 +#define DPRINTK_IOCTL(_f, _a...) ((void)0)
52649 +#endif
52650 +
52651 +struct xlbd_type_info
52652 +{
52653 +       int partn_shift;
52654 +       int disks_per_major;
52655 +       char *devname;
52656 +       char *diskname;
52657 +};
52658 +
52659 +struct xlbd_major_info
52660 +{
52661 +       int major;
52662 +       int index;
52663 +       int usage;
52664 +       struct xlbd_type_info *type;
52665 +};
52666 +
52667 +struct blk_shadow {
52668 +       blkif_request_t req;
52669 +       unsigned long request;
52670 +       unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
52671 +};
52672 +
52673 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
52674 +
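BLK_RING_SIZE is however many request/response slots fit in one shared page after the ring header, rounded down to a power of two by the __RING_SIZE machinery. A minimal userspace sketch of that calculation; the 64-byte header and 112-byte entry size are assumptions for illustration, not taken from this patch:

#include <stdio.h>

static unsigned int round_down_pow2(unsigned int n)
{
        unsigned int p = 1;
        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned int page = 4096, header = 64, entry = 112; /* assumed sizes */
        unsigned int slots = (page - header) / entry;       /* 36 */
        printf("ring size = %u\n", round_down_pow2(slots)); /* 32 */
        return 0;
}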
52675 +/*
52676 + * We have one of these per vbd, whether ide, scsi or 'other'.  They
52677 + * hang in private_data off the gendisk structure. We may end up
52678 + * putting all kinds of interesting stuff here :-)
52679 + */
52680 +struct blkfront_info
52681 +{
52682 +       struct xenbus_device *xbdev;
52683 +       dev_t dev;
52684 +       struct gendisk *gd;
52685 +       int vdevice;
52686 +       blkif_vdev_t handle;
52687 +       int connected;
52688 +       int ring_ref;
52689 +       blkif_front_ring_t ring;
52690 +       unsigned int evtchn, irq;
52691 +       struct xlbd_major_info *mi;
52692 +       request_queue_t *rq;
52693 +       struct work_struct work;
52694 +       struct gnttab_free_callback callback;
52695 +       struct blk_shadow shadow[BLK_RING_SIZE];
52696 +       unsigned long shadow_free;
52697 +       int feature_barrier;
52698 +
52699 +       /**
52700 +        * The number of people holding this device open.  We won't allow a
52701 +        * hot-unplug unless this is 0.
52702 +        */
52703 +       int users;
52704 +};
52705 +
52706 +extern spinlock_t blkif_io_lock;
52707 +
52708 +extern int blkif_open(struct inode *inode, struct file *filep);
52709 +extern int blkif_release(struct inode *inode, struct file *filep);
52710 +extern int blkif_ioctl(struct inode *inode, struct file *filep,
52711 +                      unsigned command, unsigned long argument);
52712 +extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
52713 +extern int blkif_check(dev_t dev);
52714 +extern int blkif_revalidate(dev_t dev);
52715 +extern void do_blkif_request (request_queue_t *rq);
52716 +
52717 +/* Virtual block-device subsystem. */
52718 +/* Note that xlvbd_add doesn't call add_disk for you: you're expected
52719 +   to call add_disk on info->gd once the disk is properly connected
52720 +   up. */
52721 +int xlvbd_add(blkif_sector_t capacity, int device,
52722 +             u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
52723 +void xlvbd_del(struct blkfront_info *info);
52724 +int xlvbd_barrier(struct blkfront_info *info);
52725 +
52726 +#endif /* __XEN_DRIVERS_BLOCK_H__ */
52727 diff -ruNp linux-2.6.19/drivers/xen/blkfront/vbd.c linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/vbd.c
52728 --- linux-2.6.19/drivers/xen/blkfront/vbd.c     1970-01-01 00:00:00.000000000 +0000
52729 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blkfront/vbd.c   2007-02-02 19:10:45.000000000 +0000
52730 @@ -0,0 +1,375 @@
52731 +/******************************************************************************
52732 + * vbd.c
52733 + * 
52734 + * XenLinux virtual block-device driver (xvd).
52735 + * 
52736 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
52737 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
52738 + * Copyright (c) 2004-2005, Christian Limpach
52739 + * 
52740 + * This program is free software; you can redistribute it and/or
52741 + * modify it under the terms of the GNU General Public License version 2
52742 + * as published by the Free Software Foundation; or, when distributed
52743 + * separately from the Linux kernel or incorporated into other
52744 + * software packages, subject to the following license:
52745 + * 
52746 + * Permission is hereby granted, free of charge, to any person obtaining a copy
52747 + * of this source file (the "Software"), to deal in the Software without
52748 + * restriction, including without limitation the rights to use, copy, modify,
52749 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
52750 + * and to permit persons to whom the Software is furnished to do so, subject to
52751 + * the following conditions:
52752 + * 
52753 + * The above copyright notice and this permission notice shall be included in
52754 + * all copies or substantial portions of the Software.
52755 + * 
52756 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52757 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52758 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52759 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52760 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52761 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
52762 + * IN THE SOFTWARE.
52763 + */
52764 +
52765 +#include "block.h"
52766 +#include <linux/blkdev.h>
52767 +#include <linux/list.h>
52768 +
52769 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
52770 +#include <xen/platform-compat.h>
52771 +#endif
52772 +
52773 +#define BLKIF_MAJOR(dev) ((dev)>>8)
52774 +#define BLKIF_MINOR(dev) ((dev) & 0xff)
52775 +
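The 'virtual-device' number handed to blkfront packs a Linux major/minor pair, split by the two macros above. A minimal userspace sketch decoding two conventional vbd numbers (768 and 2049 are sample values, not from the patch):

#include <stdio.h>

#define BLKIF_MAJOR(dev) ((dev) >> 8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

int main(void)
{
        int vdevs[] = { 768, 2049 };   /* e.g. hda and sda1 by standard majors */
        int i;

        for (i = 0; i < 2; i++)
                printf("vdevice %d -> major %d, minor %d\n",
                       vdevs[i], BLKIF_MAJOR(vdevs[i]), BLKIF_MINOR(vdevs[i]));
        return 0;
}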
52776 +/*
52777 + * For convenience we distinguish between ide, scsi and 'other' (i.e.,
52778 + * potentially combinations of the two) in the naming scheme and in a few other
52779 + * places.
52780 + */
52781 +
52782 +#define NUM_IDE_MAJORS 10
52783 +#define NUM_SCSI_MAJORS 17
52784 +#define NUM_VBD_MAJORS 1
52785 +
52786 +static struct xlbd_type_info xlbd_ide_type = {
52787 +       .partn_shift = 6,
52788 +       .disks_per_major = 2,
52789 +       .devname = "ide",
52790 +       .diskname = "hd",
52791 +};
52792 +
52793 +static struct xlbd_type_info xlbd_scsi_type = {
52794 +       .partn_shift = 4,
52795 +       .disks_per_major = 16,
52796 +       .devname = "sd",
52797 +       .diskname = "sd",
52798 +};
52799 +
52800 +static struct xlbd_type_info xlbd_vbd_type = {
52801 +       .partn_shift = 4,
52802 +       .disks_per_major = 16,
52803 +       .devname = "xvd",
52804 +       .diskname = "xvd",
52805 +};
52806 +
52807 +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
52808 +                                        NUM_VBD_MAJORS];
52809 +
52810 +#define XLBD_MAJOR_IDE_START   0
52811 +#define XLBD_MAJOR_SCSI_START  (NUM_IDE_MAJORS)
52812 +#define XLBD_MAJOR_VBD_START   (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
52813 +
52814 +#define XLBD_MAJOR_IDE_RANGE   XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
52815 +#define XLBD_MAJOR_SCSI_RANGE  XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
52816 +#define XLBD_MAJOR_VBD_RANGE   XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
52817 +
52818 +/* Information about our VBDs. */
52819 +#define MAX_VBDS 64
52820 +static LIST_HEAD(vbds_list);
52821 +
52822 +static struct block_device_operations xlvbd_block_fops =
52823 +{
52824 +       .owner = THIS_MODULE,
52825 +       .open = blkif_open,
52826 +       .release = blkif_release,
52827 +       .ioctl  = blkif_ioctl,
52828 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
52829 +       .getgeo = blkif_getgeo
52830 +#endif
52831 +};
52832 +
52833 +DEFINE_SPINLOCK(blkif_io_lock);
52834 +
52835 +static struct xlbd_major_info *
52836 +xlbd_alloc_major_info(int major, int minor, int index)
52837 +{
52838 +       struct xlbd_major_info *ptr;
52839 +
52840 +       ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
52841 +       if (ptr == NULL)
52842 +               return NULL;
52843 +
52844 +       ptr->major = major;
52845 +
52846 +       switch (index) {
52847 +       case XLBD_MAJOR_IDE_RANGE:
52848 +               ptr->type = &xlbd_ide_type;
52849 +               ptr->index = index - XLBD_MAJOR_IDE_START;
52850 +               break;
52851 +       case XLBD_MAJOR_SCSI_RANGE:
52852 +               ptr->type = &xlbd_scsi_type;
52853 +               ptr->index = index - XLBD_MAJOR_SCSI_START;
52854 +               break;
52855 +       case XLBD_MAJOR_VBD_RANGE:
52856 +               ptr->type = &xlbd_vbd_type;
52857 +               ptr->index = index - XLBD_MAJOR_VBD_START;
52858 +               break;
52859 +       }
52860 +
52861 +       printk("Registering block device major %i\n", ptr->major);
52862 +       if (register_blkdev(ptr->major, ptr->type->devname)) {
52863 +               WPRINTK("can't get major %d with name %s\n",
52864 +                       ptr->major, ptr->type->devname);
52865 +               kfree(ptr);
52866 +               return NULL;
52867 +       }
52868 +
52869 +/*     devfs_mk_dir(ptr->type->devname);*/
52870 +       major_info[index] = ptr;
52871 +       return ptr;
52872 +}
52873 +
52874 +static struct xlbd_major_info *
52875 +xlbd_get_major_info(int vdevice)
52876 +{
52877 +       struct xlbd_major_info *mi;
52878 +       int major, minor, index;
52879 +
52880 +       major = BLKIF_MAJOR(vdevice);
52881 +       minor = BLKIF_MINOR(vdevice);
52882 +
52883 +       switch (major) {
52884 +       case IDE0_MAJOR: index = 0; break;
52885 +       case IDE1_MAJOR: index = 1; break;
52886 +       case IDE2_MAJOR: index = 2; break;
52887 +       case IDE3_MAJOR: index = 3; break;
52888 +       case IDE4_MAJOR: index = 4; break;
52889 +       case IDE5_MAJOR: index = 5; break;
52890 +       case IDE6_MAJOR: index = 6; break;
52891 +       case IDE7_MAJOR: index = 7; break;
52892 +       case IDE8_MAJOR: index = 8; break;
52893 +       case IDE9_MAJOR: index = 9; break;
52894 +       case SCSI_DISK0_MAJOR: index = 10; break;
52895 +       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
52896 +               index = 11 + major - SCSI_DISK1_MAJOR;
52897 +               break;
52898 +        case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
52899 +                index = 18 + major - SCSI_DISK8_MAJOR;
52900 +                break;
52901 +        case SCSI_CDROM_MAJOR: index = 26; break;
52902 +        default: index = 27; break;
52903 +       }
52904 +
52905 +       mi = ((major_info[index] != NULL) ? major_info[index] :
52906 +             xlbd_alloc_major_info(major, minor, index));
52907 +       if (mi)
52908 +               mi->usage++;
52909 +       return mi;
52910 +}
52911 +
52912 +static void
52913 +xlbd_put_major_info(struct xlbd_major_info *mi)
52914 +{
52915 +       mi->usage--;
52916 +       /* XXX: release major if 0 */
52917 +}
52918 +
52919 +static int
52920 +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
52921 +{
52922 +       request_queue_t *rq;
52923 +
52924 +       rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
52925 +       if (rq == NULL)
52926 +               return -1;
52927 +
52928 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
52929 +       elevator_init(rq, "noop");
52930 +#else
52931 +       elevator_init(rq, &elevator_noop);
52932 +#endif
52933 +
52934 +       /* Hard sector size and max sectors impersonate the equiv. hardware. */
52935 +       blk_queue_hardsect_size(rq, sector_size);
52936 +       blk_queue_max_sectors(rq, 512);
52937 +
52938 +       /* Each segment in a request is up to an aligned page in size. */
52939 +       blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
52940 +       blk_queue_max_segment_size(rq, PAGE_SIZE);
52941 +
52942 +       /* Ensure a merged request will fit in a single I/O ring slot. */
52943 +       blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
52944 +       blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
52945 +
52946 +       /* Make sure buffer addresses are sector-aligned. */
52947 +       blk_queue_dma_alignment(rq, 511);
52948 +
52949 +       gd->queue = rq;
52950 +
52951 +       return 0;
52952 +}
52953 +
52954 +static int
52955 +xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
52956 +                   u16 vdisk_info, u16 sector_size,
52957 +                   struct blkfront_info *info)
52958 +{
52959 +       struct gendisk *gd;
52960 +       struct xlbd_major_info *mi;
52961 +       int nr_minors = 1;
52962 +       int err = -ENODEV;
52963 +       unsigned int offset;
52964 +
52965 +       BUG_ON(info->gd != NULL);
52966 +       BUG_ON(info->mi != NULL);
52967 +       BUG_ON(info->rq != NULL);
52968 +
52969 +       mi = xlbd_get_major_info(vdevice);
52970 +       if (mi == NULL)
52971 +               goto out;
52972 +       info->mi = mi;
52973 +
52974 +       if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
52975 +               nr_minors = 1 << mi->type->partn_shift;
52976 +
52977 +       gd = alloc_disk(nr_minors);
52978 +       if (gd == NULL)
52979 +               goto out;
52980 +
52981 +       offset =  mi->index * mi->type->disks_per_major +
52982 +                       (minor >> mi->type->partn_shift);
52983 +       if (nr_minors > 1) {
52984 +               if (offset < 26) {
52985 +                       sprintf(gd->disk_name, "%s%c",
52986 +                                mi->type->diskname, 'a' + offset );
52987 +               }
52988 +               else {
52989 +                       sprintf(gd->disk_name, "%s%c%c",
52990 +                               mi->type->diskname,
52991 +                               'a' + ((offset/26)-1), 'a' + (offset%26) );
52992 +               }
52993 +       }
52994 +       else {
52995 +               if (offset < 26) {
52996 +                       sprintf(gd->disk_name, "%s%c%d",
52997 +                               mi->type->diskname,
52998 +                               'a' + offset,
52999 +                               minor & ((1 << mi->type->partn_shift) - 1));
53000 +               }
53001 +               else {
53002 +                       sprintf(gd->disk_name, "%s%c%c%d",
53003 +                               mi->type->diskname,
53004 +                               'a' + ((offset/26)-1), 'a' + (offset%26),
53005 +                               minor & ((1 << mi->type->partn_shift) - 1));
53006 +               }
53007 +       }
53008 +
53009 +       gd->major = mi->major;
53010 +       gd->first_minor = minor;
53011 +       gd->fops = &xlvbd_block_fops;
53012 +       gd->private_data = info;
53013 +       gd->driverfs_dev = &(info->xbdev->dev);
53014 +       set_capacity(gd, capacity);
53015 +
53016 +       if (xlvbd_init_blk_queue(gd, sector_size)) {
53017 +               del_gendisk(gd);
53018 +               goto out;
53019 +       }
53020 +
53021 +       info->rq = gd->queue;
53022 +       info->gd = gd;
53023 +
53024 +       if (info->feature_barrier)
53025 +               xlvbd_barrier(info);
53026 +
53027 +       if (vdisk_info & VDISK_READONLY)
53028 +               set_disk_ro(gd, 1);
53029 +
53030 +       if (vdisk_info & VDISK_REMOVABLE)
53031 +               gd->flags |= GENHD_FL_REMOVABLE;
53032 +
53033 +       if (vdisk_info & VDISK_CDROM)
53034 +               gd->flags |= GENHD_FL_CD;
53035 +
53036 +       return 0;
53037 +
53038 + out:
53039 +       if (mi)
53040 +               xlbd_put_major_info(mi);
53041 +       info->mi = NULL;
53042 +       return err;
53043 +}
53044 +
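xlvbd_alloc_gendisk above builds disk names from an offset: offsets 0..25 get a single trailing letter, larger offsets two letters. A minimal userspace sketch of that scheme for the "xvd" prefix (the sample offsets are illustrative):

#include <stdio.h>

static void name_disk(char *buf, const char *base, unsigned int offset)
{
        if (offset < 26)
                sprintf(buf, "%s%c", base, 'a' + offset);
        else
                sprintf(buf, "%s%c%c", base,
                        'a' + ((offset / 26) - 1), 'a' + (offset % 26));
}

int main(void)
{
        char name[32];
        unsigned int offsets[] = { 0, 1, 25, 26, 27 };
        int i;

        for (i = 0; i < 5; i++) {
                name_disk(name, "xvd", offsets[i]);
                printf("offset %u -> %s\n", offsets[i], name);
        }
        /* Expected: xvda, xvdb, xvdz, xvdaa, xvdab */
        return 0;
}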
53045 +int
53046 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
53047 +         u16 sector_size, struct blkfront_info *info)
53048 +{
53049 +       struct block_device *bd;
53050 +       int err = 0;
53051 +
53052 +       info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
53053 +
53054 +       bd = bdget(info->dev);
53055 +       if (bd == NULL)
53056 +               return -ENODEV;
53057 +
53058 +       err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
53059 +                                 vdisk_info, sector_size, info);
53060 +
53061 +       bdput(bd);
53062 +       return err;
53063 +}
53064 +
53065 +void
53066 +xlvbd_del(struct blkfront_info *info)
53067 +{
53068 +       if (info->mi == NULL)
53069 +               return;
53070 +
53071 +       BUG_ON(info->gd == NULL);
53072 +       del_gendisk(info->gd);
53073 +       put_disk(info->gd);
53074 +       info->gd = NULL;
53075 +
53076 +       xlbd_put_major_info(info->mi);
53077 +       info->mi = NULL;
53078 +
53079 +       BUG_ON(info->rq == NULL);
53080 +       blk_cleanup_queue(info->rq);
53081 +       info->rq = NULL;
53082 +}
53083 +
53084 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
53085 +int
53086 +xlvbd_barrier(struct blkfront_info *info)
53087 +{
53088 +       int err;
53089 +
53090 +       err = blk_queue_ordered(info->rq,
53091 +               info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL);
53092 +       if (err)
53093 +               return err;
53094 +       printk("blkfront: %s: barriers %s\n",
53095 +              info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled");
53096 +       return 0;
53097 +}
53098 +#else
53099 +int
53100 +xlvbd_barrier(struct blkfront_info *info)
53101 +{
53102 +       printk("blkfront: %s: barriers disabled\n", info->gd->disk_name);
53103 +       return -ENOSYS;
53104 +}
53105 +#endif
53106 diff -ruNp linux-2.6.19/drivers/xen/blktap/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/blktap/Makefile
53107 --- linux-2.6.19/drivers/xen/blktap/Makefile    1970-01-01 00:00:00.000000000 +0000
53108 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blktap/Makefile  2007-02-02 19:10:45.000000000 +0000
53109 @@ -0,0 +1,6 @@
53110 +LINUXINCLUDE += -I../xen/include/public/io
53111 +
53112 +obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
53113 +
53114 +blktap-y       := xenbus.o interface.o blktapmain.o 
53115 +
53116 diff -ruNp linux-2.6.19/drivers/xen/blktap/blktap.c linux-2.6.19-xen-3.0.4/drivers/xen/blktap/blktap.c
53117 --- linux-2.6.19/drivers/xen/blktap/blktap.c    1970-01-01 00:00:00.000000000 +0000
53118 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blktap/blktap.c  2007-02-02 19:10:45.000000000 +0000
53119 @@ -0,0 +1,1517 @@
53120 +/******************************************************************************
53121 + * drivers/xen/blktap/blktap.c
53122 + * 
53123 + * Back-end driver for user level virtual block devices. This portion of the
53124 + * driver exports a 'unified' block-device interface that can be accessed
53125 + * by any operating system that implements a compatible front end. Requests
53126 + * are remapped to a user-space memory region.
53127 + *
53128 + * Based on the blkback driver code.
53129 + * 
53130 + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
53131 + *
53132 + * Clean ups and fix ups:
53133 + *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
53134 + *
53135 + * This program is free software; you can redistribute it and/or
53136 + * modify it under the terms of the GNU General Public License version 2
53137 + * as published by the Free Software Foundation; or, when distributed
53138 + * separately from the Linux kernel or incorporated into other
53139 + * software packages, subject to the following license:
53140 + * 
53141 + * Permission is hereby granted, free of charge, to any person obtaining a copy
53142 + * of this source file (the "Software"), to deal in the Software without
53143 + * restriction, including without limitation the rights to use, copy, modify,
53144 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
53145 + * and to permit persons to whom the Software is furnished to do so, subject to
53146 + * the following conditions:
53147 + * 
53148 + * The above copyright notice and this permission notice shall be included in
53149 + * all copies or substantial portions of the Software.
53150 + * 
53151 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
53152 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
53153 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
53154 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53155 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
53156 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
53157 + * IN THE SOFTWARE.
53158 + */
53159 +
53160 +#include <linux/spinlock.h>
53161 +#include <linux/kthread.h>
53162 +#include <linux/list.h>
53163 +#include <asm/hypervisor.h>
53164 +#include "common.h"
53165 +#include <xen/balloon.h>
53166 +#include <linux/kernel.h>
53167 +#include <linux/fs.h>
53168 +#include <linux/mm.h>
53169 +#include <linux/errno.h>
53170 +#include <linux/major.h>
53171 +#include <linux/gfp.h>
53172 +#include <linux/poll.h>
53173 +#include <asm/tlbflush.h>
53174 +#include <linux/devfs_fs_kernel.h>
53175 +
53176 +#define MAX_TAP_DEV 256     /*the maximum number of tapdisk ring devices    */
53177 +#define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
53178 +
53179 +
53180 +struct class *xen_class;
53181 +EXPORT_SYMBOL_GPL(xen_class);
53182 +
53183 +/*
53184 + * Setup the xen class.  This should probably go in another file, but
53185 + * since blktap is the only user of it so far, it gets to keep it.
53186 + */
53187 +int setup_xen_class(void)
53188 +{
53189 +       int ret;
53190 +
53191 +       if (xen_class)
53192 +               return 0;
53193 +
53194 +       xen_class = class_create(THIS_MODULE, "xen");
53195 +       if ((ret = IS_ERR(xen_class))) {
53196 +               xen_class = NULL;
53197 +               return ret;
53198 +       }
53199 +
53200 +       return 0;
53201 +}
53202 +
53203 +/*
53204 + * The maximum number of requests that can be outstanding at any time
53205 + * is determined by 
53206 + *
53207 + *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
53208 + *
53209 + * where mmap_alloc < MAX_DYNAMIC_MEM.
53210 + *
53211 + * TODO:
53212 + * mmap_alloc is initialised to 2 and should be adjustable on the fly via
53213 + * sysfs.
53214 + */
53215 +#define BLK_RING_SIZE          __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
53216 +#define MAX_DYNAMIC_MEM                BLK_RING_SIZE
53217 +#define MAX_PENDING_REQS       BLK_RING_SIZE
53218 +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
53219 +#define MMAP_VADDR(_start, _req,_seg)                                   \
53220 +        (_start +                                                       \
53221 +         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
53222 +         ((_seg) * PAGE_SIZE))
53223 +static int blkif_reqs = MAX_PENDING_REQS;
53224 +static int mmap_pages = MMAP_PAGES;
53225 +
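MMAP_VADDR above carves the mapped area into one fixed run of pages per pending request, one page per segment. A minimal userspace sketch of the offset arithmetic; the 11 segments per request and 4 KiB page size are assumptions matching the classic blkif ABI, not spelled out in this hunk:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define SEGS_PER_REQ 11UL   /* assumed BLKIF_MAX_SEGMENTS_PER_REQUEST */

static unsigned long mmap_vaddr(unsigned long start,
                                unsigned long req, unsigned long seg)
{
        return start + req * SEGS_PER_REQ * PAGE_SIZE + seg * PAGE_SIZE;
}

int main(void)
{
        unsigned long start = 0x100000;  /* arbitrary example base */
        /* Request 2, segment 3 lands 25 pages past the base. */
        printf("offset = 0x%lx\n", mmap_vaddr(start, 2, 3) - start); /* 0x19000 */
        return 0;
}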
53226 +#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
53227 +                     * have a bunch of pages reserved for shared
53228 +                     * memory rings.
53229 +                     */
53230 +
53231 +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
53232 +typedef struct domid_translate {
53233 +       unsigned short domid;
53234 +       unsigned short busid;
53235 +} domid_translate_t ;
53236 +
53237 +/*Data struct associated with each of the tapdisk devices*/
53238 +typedef struct tap_blkif {
53239 +       struct vm_area_struct *vma;   /*Shared memory area                   */
53240 +       unsigned long rings_vstart;   /*Kernel memory mapping                */
53241 +       unsigned long user_vstart;    /*User memory mapping                  */
53242 +       unsigned long dev_inuse;      /*One process opens device at a time.  */
53243 +       unsigned long dev_pending;    /*In process of being opened           */
53244 +       unsigned long ring_ok;        /*make this ring->state                */
53245 +       blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
53246 +       wait_queue_head_t wait;       /*for poll                             */
53247 +       unsigned long mode;           /*current switching mode               */
53248 +       int minor;                    /*Minor number for tapdisk device      */
53249 +       pid_t pid;                    /*tapdisk process id                   */
53250 +       enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
53251 +                                                 shutdown                   */
53252 +       unsigned long *idx_map;       /*Record the user ring id to kern 
53253 +                                       [req id, idx] tuple                  */
53254 +       blkif_t *blkif;               /*Associate blkif with tapdev          */
53255 +       struct domid_translate trans; /*Translation from domid to bus.       */
53256 +} tap_blkif_t;
53257 +
53258 +static struct tap_blkif *tapfds[MAX_TAP_DEV];
53259 +static int blktap_next_minor;
53260 +
53261 +static int __init set_blkif_reqs(char *str)
53262 +{
53263 +       get_option(&str, &blkif_reqs);
53264 +       return 1;
53265 +}
53266 +__setup("blkif_reqs=", set_blkif_reqs);
53267 +
53268 +/* Run-time switchable: /sys/module/blktap/parameters/ */
53269 +static unsigned int log_stats = 0;
53270 +static unsigned int debug_lvl = 0;
53271 +module_param(log_stats, int, 0644);
53272 +module_param(debug_lvl, int, 0644);
53273 +
53274 +/*
53275 + * Each outstanding request that we've passed to the lower device layers has a 
53276 + * 'pending_req' allocated to it. Each buffer_head that completes decrements 
53277 + * the pendcnt towards zero. When it hits zero, the specified domain has a 
53278 + * response queued for it, with the saved 'id' passed back.
53279 + */
53280 +typedef struct {
53281 +       blkif_t       *blkif;
53282 +       unsigned long  id;
53283 +       unsigned short mem_idx;
53284 +       int            nr_pages;
53285 +       atomic_t       pendcnt;
53286 +       unsigned short operation;
53287 +       int            status;
53288 +       struct list_head free_list;
53289 +       int            inuse;
53290 +} pending_req_t;
53291 +
53292 +static pending_req_t *pending_reqs[MAX_PENDING_REQS];
53293 +static struct list_head pending_free;
53294 +static DEFINE_SPINLOCK(pending_free_lock);
53295 +static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
53296 +static int alloc_pending_reqs;
53297 +
53298 +typedef unsigned int PEND_RING_IDX;
53299 +
53300 +static inline int MASK_PEND_IDX(int i) { 
53301 +       return (i & (MAX_PENDING_REQS-1));
53302 +}
53303 +
53304 +static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
53305 +       return (req - pending_reqs[idx]);
53306 +}
53307 +
53308 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
53309 +
53310 +#define BLKBACK_INVALID_HANDLE (~0)
53311 +
53312 +static struct page **foreign_pages[MAX_DYNAMIC_MEM];
53313 +static inline unsigned long idx_to_kaddr(
53314 +       unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
53315 +{
53316 +       unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
53317 +       unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
53318 +       return (unsigned long)pfn_to_kaddr(pfn);
53319 +}
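+/*
+ * Worked example (assuming the usual BLKIF_MAX_SEGMENTS_PER_REQUEST of 11):
+ * segment 3 of request slot 2 in memory pool 0 is page
+ * foreign_pages[0][2*11 + 3] == foreign_pages[0][25], and
+ * idx_to_kaddr(0, 2, 3) returns that page's kernel virtual address.
+ */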
53320 +
53321 +static unsigned short mmap_alloc = 0;
53322 +static unsigned short mmap_lock = 0;
53323 +static unsigned short mmap_inuse = 0;
53324 +
53325 +/******************************************************************
53326 + * GRANT HANDLES
53327 + */
53328 +
53329 +/* When using grant tables to map a frame for device access then the
53330 + * handle returned must be used to unmap the frame. This is needed to
53331 + * drop the ref count on the frame.
53332 + */
53333 +struct grant_handle_pair
53334 +{
53335 +        grant_handle_t kernel;
53336 +        grant_handle_t user;
53337 +};
53338 +#define INVALID_GRANT_HANDLE   0xFFFF
53339 +
53340 +static struct grant_handle_pair 
53341 +    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
53342 +#define pending_handle(_id, _idx, _i) \
53343 +    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
53344 +    + (_i)])
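+/*
+ * pending_handle(id, idx, i) thus names the grant-handle pair for segment i
+ * of request slot idx in memory pool id: ->kernel is the handle of the
+ * kernel mapping and ->user that of the tapdisk mapping; both are needed
+ * later to unmap the frame and drop its reference.
+ */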
53345 +
53346 +
53347 +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
53348 +
53349 +#define BLKTAP_MINOR 0  /*/dev/xen/blktap has a dynamic major */
53350 +#define BLKTAP_DEV_DIR  "/dev/xen"
53351 +
53352 +static int blktap_major;
53353 +
53354 +/* blktap IOCTLs: */
53355 +#define BLKTAP_IOCTL_KICK_FE         1
53356 +#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
53357 +#define BLKTAP_IOCTL_SETMODE         3
53358 +#define BLKTAP_IOCTL_SENDPID        4
53359 +#define BLKTAP_IOCTL_NEWINTF        5
53360 +#define BLKTAP_IOCTL_MINOR          6
53361 +#define BLKTAP_IOCTL_MAJOR          7
53362 +#define BLKTAP_QUERY_ALLOC_REQS      8
53363 +#define BLKTAP_IOCTL_FREEINTF        9
53364 +#define BLKTAP_IOCTL_PRINT_IDXS      100  
53365 +
53366 +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
53367 +#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
53368 +#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
53369 +#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
53370 +
53371 +#define BLKTAP_MODE_INTERPOSE \
53372 +           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
53373 +
53374 +
53375 +static inline int BLKTAP_MODE_VALID(unsigned long arg)
53376 +{
53377 +       return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
53378 +               (arg == BLKTAP_MODE_INTERCEPT_FE) ||
53379 +                (arg == BLKTAP_MODE_INTERPOSE   ));
53380 +}
53381 +
53382 +/* Requests passing through the tap to userspace are re-assigned an ID.
53383 + * We must record a mapping between the BE [IDX,ID] tuple and the userspace
53384 + * ring ID. 
53385 + */
53386 +
53387 +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
53388 +{
53389 +        return ((fe_dom << 16) | MASK_PEND_IDX(idx));
53390 +}
53391 +
53392 +static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
53393 +{
53394 +        return (PEND_RING_IDX)(id & 0x0000ffff);
53395 +}
53396 +
53397 +static inline int ID_TO_MIDX(unsigned long id)
53398 +{
53399 +        return (int)(id >> 16);
53400 +}
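+/*
+ * The packing is 16/16 bits.  For instance MAKE_ID(3, 5) yields 0x00030005,
+ * from which ID_TO_MIDX() recovers 3 and ID_TO_IDX() recovers 5; indices are
+ * first wrapped into [0, MAX_PENDING_REQS) by MASK_PEND_IDX().
+ */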
53401 +
53402 +#define INVALID_REQ 0xdead0000
53403 +
53404 +/*TODO: Convert to a free list*/
53405 +static inline int GET_NEXT_REQ(unsigned long *idx_map)
53406 +{
53407 +       int i;
53408 +       for (i = 0; i < MAX_PENDING_REQS; i++)
53409 +               if (idx_map[i] == INVALID_REQ)
53410 +                       return i;
53411 +
53412 +       return INVALID_REQ;
53413 +}
53414 +
53415 +
53416 +#define BLKTAP_INVALID_HANDLE(_g) \
53417 +    (((_g->kernel) == INVALID_GRANT_HANDLE) &&  \
53418 +     ((_g->user) == INVALID_GRANT_HANDLE))
53419 +
53420 +#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
53421 +    (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
53422 +    } while(0)
53423 +
53424 +
53425 +/******************************************************************
53426 + * BLKTAP VM OPS
53427 + */
53428 +
53429 +static struct page *blktap_nopage(struct vm_area_struct *vma,
53430 +                                 unsigned long address,
53431 +                                 int *type)
53432 +{
53433 +       /*
53434 +        * if the page has not been mapped in by the driver then return
53435 +        * NOPAGE_SIGBUS to the domain.
53436 +        */
53437 +
53438 +       return NOPAGE_SIGBUS;
53439 +}
53440 +
53441 +struct vm_operations_struct blktap_vm_ops = {
53442 +       nopage:   blktap_nopage,
53443 +};
53444 +
53445 +/******************************************************************
53446 + * BLKTAP FILE OPS
53447 + */
53448 +
53449 +/*Function Declarations*/
53450 +static tap_blkif_t *get_next_free_dev(void);
53451 +static int blktap_open(struct inode *inode, struct file *filp);
53452 +static int blktap_release(struct inode *inode, struct file *filp);
53453 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
53454 +static int blktap_ioctl(struct inode *inode, struct file *filp,
53455 +                        unsigned int cmd, unsigned long arg);
53456 +static unsigned int blktap_poll(struct file *file, poll_table *wait);
53457 +
53458 +static struct file_operations blktap_fops = {
53459 +       .owner   = THIS_MODULE,
53460 +       .poll    = blktap_poll,
53461 +       .ioctl   = blktap_ioctl,
53462 +       .open    = blktap_open,
53463 +       .release = blktap_release,
53464 +       .mmap    = blktap_mmap,
53465 +};
53466 +
53467 +
53468 +static tap_blkif_t *get_next_free_dev(void)
53469 +{
53470 +       tap_blkif_t *info;
53471 +       int minor;
53472 +
53473 +       /*
53474 +        * This is called only from the ioctl, which
53475 +        * means we should always have interrupts enabled.
53476 +        */
53477 +       BUG_ON(irqs_disabled());
53478 +
53479 +       spin_lock_irq(&pending_free_lock);
53480 +
53481 +       /* tapfds[0] is always NULL */
53482 +
53483 +       for (minor = 1; minor < blktap_next_minor; minor++) {
53484 +               info = tapfds[minor];
53485 +               /* we could have failed a previous attempt. */
53486 +               if (!info ||
53487 +                   ((info->dev_inuse == 0) &&
53488 +                    (info->dev_pending == 0)) ) {
53489 +                       if (info) info->dev_pending = 1;
53490 +                       goto found;
53491 +               }
53492 +       }
53493 +       info = NULL;
53494 +       minor = -1;
53495 +
53496 +       /*
53497 +        * We didn't find a free device. If we can still allocate
53498 +        * more, then we grab the next device minor that is
53499 +        * available.  This is done while we are still under
53500 +        * the protection of the pending_free_lock.
53501 +        */
53502 +       if (blktap_next_minor < MAX_TAP_DEV)
53503 +               minor = blktap_next_minor++;
53504 +found:
53505 +       spin_unlock_irq(&pending_free_lock);
53506 +
53507 +       if (!info && minor > 0) {
53508 +               info = kzalloc(sizeof(*info), GFP_KERNEL);
53509 +               if (unlikely(!info)) {
53510 +                       /*
53511 +                        * If we failed here, try to put back
53512 +                        * the next minor number. But if one
53513 +                        * was just taken, then we just lose this
53514 +                        * minor.  We can try to allocate this
53515 +                        * minor again later.
53516 +                        */
53517 +                       spin_lock_irq(&pending_free_lock);
53518 +                       if (blktap_next_minor == minor+1)
53519 +                               blktap_next_minor--;
53520 +                       spin_unlock_irq(&pending_free_lock);
53521 +                       goto out;
53522 +               }
53523 +
53524 +               info->minor = minor;
53525 +               /*
53526 +                * Make sure that we have a minor before others can
53527 +                * see us.
53528 +                */
53529 +               wmb();
53530 +               tapfds[minor] = info;
53531 +
53532 +               class_device_create(xen_class, NULL,
53533 +                                   MKDEV(blktap_major, minor), NULL,
53534 +                                   "blktap%d", minor);
53535 +               devfs_mk_cdev(MKDEV(blktap_major, minor),
53536 +                       S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor);
53537 +       }
53538 +
53539 +out:
53540 +       return info;
53541 +}
53542 +
53543 +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
53544 +{
53545 +       tap_blkif_t *info;
53546 +       int i;
53547 +
53548 +       for (i = 1; i < blktap_next_minor; i++) {
53549 +               info = tapfds[i];
53550 +               if ( info &&
53551 +                    (info->trans.domid == domid) &&
53552 +                    (info->trans.busid == xenbus_id) ) {
53553 +                       info->blkif = blkif;
53554 +                       info->status = RUNNING;
53555 +                       return i;
53556 +               }
53557 +       }
53558 +       return -1;
53559 +}
53560 +
53561 +void signal_tapdisk(int idx) 
53562 +{
53563 +       tap_blkif_t *info;
53564 +       struct task_struct *ptask;
53565 +
53566 +       info = tapfds[idx];
53567 +       if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
53568 +               return;
53569 +
53570 +       if (info->pid > 0) {
53571 +               ptask = find_task_by_pid(info->pid);
53572 +               if (ptask)
53573 +                       info->status = CLEANSHUTDOWN;
53574 +       }
53575 +       info->blkif = NULL;
53576 +
53577 +       return;
53578 +}
53579 +
53580 +static int blktap_open(struct inode *inode, struct file *filp)
53581 +{
53582 +       blkif_sring_t *sring;
53583 +       int idx = iminor(inode) - BLKTAP_MINOR;
53584 +       tap_blkif_t *info;
53585 +       int i;
53586 +       
53587 +       /* ctrl device, treat differently */
53588 +       if (!idx)
53589 +               return 0;
53590 +
53591 +       info = tapfds[idx];
53592 +
53593 +       if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) {
53594 +               WPRINTK("Unable to open device /dev/xen/blktap%d\n",
53595 +                       idx);
53596 +               return -ENODEV;
53597 +       }
53598 +
53599 +       DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
53600 +       
53601 +       /*Only one process can access device at a time*/
53602 +       if (test_and_set_bit(0, &info->dev_inuse))
53603 +               return -EBUSY;
53604 +
53605 +       info->dev_pending = 0;
53606 +           
53607 +       /* Allocate the fe ring. */
53608 +       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
53609 +       if (sring == NULL)
53610 +               goto fail_nomem;
53611 +
53612 +       SetPageReserved(virt_to_page(sring));
53613 +    
53614 +       SHARED_RING_INIT(sring);
53615 +       FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
53616 +       
53617 +       filp->private_data = info;
53618 +       info->vma = NULL;
53619 +
53620 +       info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, 
53621 +                               GFP_KERNEL);
53622 +       
53623 +       if (idx > 0) {
53624 +               init_waitqueue_head(&info->wait);
53625 +               for (i = 0; i < MAX_PENDING_REQS; i++) 
53626 +                       info->idx_map[i] = INVALID_REQ;
53627 +       }
53628 +
53629 +       DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
53630 +       return 0;
53631 +
53632 + fail_nomem:
53633 +       return -ENOMEM;
53634 +}
53635 +
53636 +static int blktap_release(struct inode *inode, struct file *filp)
53637 +{
53638 +       tap_blkif_t *info = filp->private_data;
53639 +       
53640 +       /* check for control device */
53641 +       if (!info)
53642 +               return 0;
53643 +
53644 +       info->dev_inuse = 0;
53645 +       DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
53646 +
53647 +       /* Free the ring page. */
53648 +       ClearPageReserved(virt_to_page(info->ufe_ring.sring));
53649 +       free_page((unsigned long) info->ufe_ring.sring);
53650 +
53651 +       /* Clear any active mappings and free foreign map table */
53652 +       if (info->vma) {
53653 +               zap_page_range(
53654 +                       info->vma, info->vma->vm_start, 
53655 +                       info->vma->vm_end - info->vma->vm_start, NULL);
53656 +               info->vma = NULL;
53657 +       }
53658 +       
53659 +       if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
53660 +               if (info->blkif->xenblkd != NULL) {
53661 +                       kthread_stop(info->blkif->xenblkd);
53662 +                       info->blkif->xenblkd = NULL;
53663 +               }
53664 +               info->status = CLEANSHUTDOWN;
53665 +       }       
53666 +       return 0;
53667 +}
53668 +
53669 +
53670 +/* Note on mmap:
53671 + * We need to map pages to user space in a way that will allow the block
53672 + * subsystem set up direct IO to them.  This couldn't be done before, because
53673 + * there isn't really a sane way to translate a user virtual address down to a 
53674 + * physical address when the page belongs to another domain.
53675 + *
53676 + * My first approach was to map the page in to kernel memory, add an entry
53677 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
53678 + * and then attempt to map that page up to user space.  This is disallowed
53679 + * by xen though, which realizes that we don't really own the machine frame
53680 + * underlying the physical page.
53681 + *
53682 + * The new approach is to provide explicit support for this in xen linux.
53683 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
53684 + * mapped from other vms.  vma->vm_private_data is set up as a mapping 
53685 + * from pages to actual page structs.  There is a new clause in get_user_pages
53686 + * that does the right thing for this sort of mapping.
53687 + */
53688 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
53689 +{
53690 +       int size;
53691 +       struct page **map;
53692 +       int i;
53693 +       tap_blkif_t *info = filp->private_data;
53694 +
53695 +       if (info == NULL) {
53696 +               WPRINTK("blktap: mmap, retrieving idx failed\n");
53697 +               return -ENOMEM;
53698 +       }
53699 +       
53700 +       vma->vm_flags |= VM_RESERVED;
53701 +       vma->vm_ops = &blktap_vm_ops;
53702 +
53703 +       size = vma->vm_end - vma->vm_start;
53704 +       if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
53705 +               WPRINTK("you _must_ map exactly %d pages!\n",
53706 +                      mmap_pages + RING_PAGES);
53707 +               return -EAGAIN;
53708 +       }
53709 +
53710 +       size >>= PAGE_SHIFT;
53711 +       info->rings_vstart = vma->vm_start;
53712 +       info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
53713 +    
53714 +       /* Map the ring pages to the start of the region and reserve it. */
53715 +       if (remap_pfn_range(vma, vma->vm_start, 
53716 +                           __pa(info->ufe_ring.sring) >> PAGE_SHIFT, 
53717 +                           PAGE_SIZE, vma->vm_page_prot)) {
53718 +               WPRINTK("Mapping user ring failed!\n");
53719 +               goto fail;
53720 +       }
53721 +
53722 +       /* Mark this VM as containing foreign pages, and set up mappings. */
53723 +       map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
53724 +                     * sizeof(struct page *),
53725 +                     GFP_KERNEL);
53726 +       if (map == NULL) {
53727 +               WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
53728 +               goto fail;
53729 +       }
53730 +
53731 +       for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
53732 +               map[i] = NULL;
53733 +    
53734 +       vma->vm_private_data = map;
53735 +       vma->vm_flags |= VM_FOREIGN;
53736 +
53737 +       info->vma = vma;
53738 +       info->ring_ok = 1;
53739 +       return 0;
53740 + fail:
53741 +       /* Clear any active mappings. */
53742 +       zap_page_range(vma, vma->vm_start, 
53743 +                      vma->vm_end - vma->vm_start, NULL);
53744 +
53745 +       return -ENOMEM;
53746 +}
53747 +
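+/*
+ * The resulting VMA layout, as set up above, is (low to high):
+ *
+ *   rings_vstart: RING_PAGES page(s) backing the shared ufe_ring
+ *   user_vstart : mmap_pages data pages, one slot of
+ *                 BLKIF_MAX_SEGMENTS_PER_REQUEST pages per pending request
+ *
+ * which is why the mmap() size is checked against
+ * (mmap_pages + RING_PAGES) << PAGE_SHIFT, and why MMAP_VADDR() can derive a
+ * segment's user address from the (request, segment) pair alone.
+ */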
53748 +
53749 +static int blktap_ioctl(struct inode *inode, struct file *filp,
53750 +                        unsigned int cmd, unsigned long arg)
53751 +{
53752 +       tap_blkif_t *info = filp->private_data;
53753 +
53754 +       switch(cmd) {
53755 +       case BLKTAP_IOCTL_KICK_FE: 
53756 +       {
53757 +               /* There are fe messages to process. */
53758 +               return blktap_read_ufe_ring(info);
53759 +       }
53760 +       case BLKTAP_IOCTL_SETMODE:
53761 +       {
53762 +               if (info) {
53763 +                       if (BLKTAP_MODE_VALID(arg)) {
53764 +                               info->mode = arg;
53765 +                               /* XXX: may need to flush rings here. */
53766 +                               DPRINTK("blktap: set mode to %lx\n", 
53767 +                                      arg);
53768 +                               return 0;
53769 +                       }
53770 +               }
53771 +               return 0;
53772 +       }
53773 +       case BLKTAP_IOCTL_PRINT_IDXS:
53774 +        {
53775 +               if (info) {
53776 +                       printk("User Rings: \n-----------\n");
53777 +                       printk("UF: rsp_cons: %2d, req_prod_pvt: %2d "
53778 +                               "| req_prod: %2d, rsp_prod: %2d\n",
53779 +                               info->ufe_ring.rsp_cons,
53780 +                               info->ufe_ring.req_prod_pvt,
53781 +                               info->ufe_ring.sring->req_prod,
53782 +                               info->ufe_ring.sring->rsp_prod);
53783 +               }
53784 +               return 0;
53785 +        }
53786 +       case BLKTAP_IOCTL_SENDPID:
53787 +       {
53788 +               if (info) {
53789 +                       info->pid = (pid_t)arg;
53790 +                       DPRINTK("blktap: pid received %d\n", 
53791 +                              info->pid);
53792 +               }
53793 +               return 0;
53794 +       }
53795 +       case BLKTAP_IOCTL_NEWINTF:
53796 +       {               
53797 +               uint64_t val = (uint64_t)arg;
53798 +               domid_translate_t *tr = (domid_translate_t *)&val;
53799 +
53800 +               DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
53801 +                      tr->domid, tr->busid);
53802 +               info = get_next_free_dev();
53803 +               if (!info) {
53804 +                       WPRINTK("Error initialising /dev/xen/blktap - "
53805 +                               "No more devices\n");
53806 +                       return -1;
53807 +               }
53808 +               info->trans.domid = tr->domid;
53809 +               info->trans.busid = tr->busid;
53810 +               return info->minor;
53811 +       }
53812 +       case BLKTAP_IOCTL_FREEINTF:
53813 +       {
53814 +               unsigned long dev = arg;
53815 +               unsigned long flags;
53816 +
53817 +               info = tapfds[dev];
53818 +
53819 +               if ((dev > MAX_TAP_DEV) || !info)
53820 +                       return 0; /* should this be an error? */
53821 +
53822 +               spin_lock_irqsave(&pending_free_lock, flags);
53823 +               if (info->dev_pending)
53824 +                       info->dev_pending = 0;
53825 +               spin_unlock_irqrestore(&pending_free_lock, flags);
53826 +
53827 +               return 0;
53828 +       }
53829 +       case BLKTAP_IOCTL_MINOR:
53830 +       {
53831 +               unsigned long dev = arg;
53832 +
53833 +               info = tapfds[dev];
53834 +
53835 +               if ((dev > MAX_TAP_DEV) || !info)
53836 +                       return -EINVAL;
53837 +
53838 +               return info->minor;
53839 +       }
53840 +       case BLKTAP_IOCTL_MAJOR:
53841 +               return blktap_major;
53842 +
53843 +       case BLKTAP_QUERY_ALLOC_REQS:
53844 +       {
53845 +               WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
53846 +                      alloc_pending_reqs, blkif_reqs);
53847 +               return (alloc_pending_reqs/blkif_reqs) * 100;
53848 +       }
53849 +       }
53850 +       return -ENOIOCTLCMD;
53851 +}
53852 +
53853 +static unsigned int blktap_poll(struct file *filp, poll_table *wait)
53854 +{
53855 +       tap_blkif_t *info = filp->private_data;
53856 +       
53857 +       /* do not work on the control device */
53858 +       if (!info)
53859 +               return 0;
53860 +
53861 +       poll_wait(filp, &info->wait, wait);
53862 +       if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
53863 +               RING_PUSH_REQUESTS(&info->ufe_ring);
53864 +               return POLLIN | POLLRDNORM;
53865 +       }
53866 +       return 0;
53867 +}
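+/*
+ * In outline, poll() and the KICK_FE ioctl form the user-space side of the
+ * protocol: tapdisk sleeps in poll() on /dev/xen/blktapN; when
+ * do_block_io_op() has queued requests, blktap_kick_user() wakes it and
+ * blktap_poll() publishes the requests on the shared ring; after servicing
+ * them, tapdisk writes its responses to the same ring and calls
+ * ioctl(BLKTAP_IOCTL_KICK_FE), which ends up in blktap_read_ufe_ring() below
+ * to complete the requests back to the guest.
+ */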
53868 +
53869 +void blktap_kick_user(int idx)
53870 +{
53871 +       tap_blkif_t *info;
53872 +
53873 +       info = tapfds[idx];
53874 +
53875 +       if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
53876 +               return;
53877 +
53878 +       wake_up_interruptible(&info->wait);
53879 +
53880 +       return;
53881 +}
53882 +
53883 +static int do_block_io_op(blkif_t *blkif);
53884 +static void dispatch_rw_block_io(blkif_t *blkif,
53885 +                                blkif_request_t *req,
53886 +                                pending_req_t *pending_req);
53887 +static void make_response(blkif_t *blkif, unsigned long id, 
53888 +                          unsigned short op, int st);
53889 +
53890 +/******************************************************************
53891 + * misc small helpers
53892 + */
53893 +static int req_increase(void)
53894 +{
53895 +       int i, j;
53896 +
53897 +       if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
53898 +               return -EINVAL;
53899 +
53900 +       pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
53901 +                                           * blkif_reqs, GFP_KERNEL);
53902 +       foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
53903 +
53904 +       if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
53905 +               goto out_of_memory;
53906 +
53907 +       DPRINTK("%s: reqs=%d, pages=%d\n",
53908 +               __FUNCTION__, blkif_reqs, mmap_pages);
53909 +
53910 +       for (i = 0; i < MAX_PENDING_REQS; i++) {
53911 +               list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
53912 +                             &pending_free);
53913 +               pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
53914 +               for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
53915 +                       BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
53916 +                                                                i, j));
53917 +       }
53918 +
53919 +       mmap_alloc++;
53920 +       DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
53921 +       return 0;
53922 +
53923 + out_of_memory:
53924 +       free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
53925 +       kfree(pending_reqs[mmap_alloc]);
53926 +       WPRINTK("%s: out of memory\n", __FUNCTION__);
53927 +       return -ENOMEM;
53928 +}
53929 +
53930 +static void mmap_req_del(int mmap)
53931 +{
53932 +       BUG_ON(!spin_is_locked(&pending_free_lock));
53933 +
53934 +       kfree(pending_reqs[mmap]);
53935 +       pending_reqs[mmap] = NULL;
53936 +
53937 +       free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
53938 +       foreign_pages[mmap] = NULL;
53939 +
53940 +       mmap_lock = 0;
53941 +       DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
53942 +       mmap_alloc--;
53943 +}
53944 +
53945 +static pending_req_t* alloc_req(void)
53946 +{
53947 +       pending_req_t *req = NULL;
53948 +       unsigned long flags;
53949 +
53950 +       spin_lock_irqsave(&pending_free_lock, flags);
53951 +
53952 +       if (!list_empty(&pending_free)) {
53953 +               req = list_entry(pending_free.next, pending_req_t, free_list);
53954 +               list_del(&req->free_list);
53955 +       }
53956 +
53957 +       if (req) {
53958 +               req->inuse = 1;
53959 +               alloc_pending_reqs++;
53960 +       }
53961 +       spin_unlock_irqrestore(&pending_free_lock, flags);
53962 +
53963 +       return req;
53964 +}
53965 +
53966 +static void free_req(pending_req_t *req)
53967 +{
53968 +       unsigned long flags;
53969 +       int was_empty;
53970 +
53971 +       spin_lock_irqsave(&pending_free_lock, flags);
53972 +
53973 +       alloc_pending_reqs--;
53974 +       req->inuse = 0;
53975 +       if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
53976 +               mmap_inuse--;
53977 +               if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
53978 +               spin_unlock_irqrestore(&pending_free_lock, flags);
53979 +               return;
53980 +       }
53981 +       was_empty = list_empty(&pending_free);
53982 +       list_add(&req->free_list, &pending_free);
53983 +
53984 +       spin_unlock_irqrestore(&pending_free_lock, flags);
53985 +
53986 +       if (was_empty)
53987 +               wake_up(&pending_free_wq);
53988 +}
53989 +
53990 +static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
53991 +                           int tapidx)
53992 +{
53993 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
53994 +       unsigned int i, invcount = 0;
53995 +       struct grant_handle_pair *khandle;
53996 +       uint64_t ptep;
53997 +       int ret, mmap_idx;
53998 +       unsigned long kvaddr, uvaddr;
53999 +       tap_blkif_t *info;
54000 +       
54001 +
54002 +       info = tapfds[tapidx];
54003 +
54004 +       if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
54005 +               WPRINTK("fast_flush: Couldn't get info!\n");
54006 +               return;
54007 +       }
54008 +
54009 +       if (info->vma != NULL &&
54010 +           xen_feature(XENFEAT_auto_translated_physmap)) {
54011 +               down_write(&info->vma->vm_mm->mmap_sem);
54012 +               zap_page_range(info->vma, 
54013 +                              MMAP_VADDR(info->user_vstart, u_idx, 0), 
54014 +                              req->nr_pages << PAGE_SHIFT, NULL);
54015 +               up_write(&info->vma->vm_mm->mmap_sem);
54016 +       }
54017 +
54018 +       mmap_idx = req->mem_idx;
54019 +
54020 +       for (i = 0; i < req->nr_pages; i++) {
54021 +               kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
54022 +               uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
54023 +
54024 +               khandle = &pending_handle(mmap_idx, k_idx, i);
54025 +
54026 +               if (khandle->kernel != INVALID_GRANT_HANDLE) {
54027 +                       gnttab_set_unmap_op(&unmap[invcount],
54028 +                                           idx_to_kaddr(mmap_idx, k_idx, i),
54029 +                                           GNTMAP_host_map, khandle->kernel);
54030 +                       invcount++;
54031 +               }
54032 +
54033 +               if (khandle->user != INVALID_GRANT_HANDLE) {
54034 +                       BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
54035 +                       if (create_lookup_pte_addr(
54036 +                               info->vma->vm_mm,
54037 +                               MMAP_VADDR(info->user_vstart, u_idx, i),
54038 +                               &ptep) !=0) {
54039 +                               WPRINTK("Couldn't get a pte addr!\n");
54040 +                               return;
54041 +                       }
54042 +
54043 +                       gnttab_set_unmap_op(&unmap[invcount], ptep,
54044 +                                           GNTMAP_host_map
54045 +                                           | GNTMAP_application_map
54046 +                                           | GNTMAP_contains_pte,
54047 +                                           khandle->user);
54048 +                       invcount++;
54049 +               }
54050 +
54051 +               BLKTAP_INVALIDATE_HANDLE(khandle);
54052 +       }
54053 +       ret = HYPERVISOR_grant_table_op(
54054 +               GNTTABOP_unmap_grant_ref, unmap, invcount);
54055 +       BUG_ON(ret);
54056 +       
54057 +       if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
54058 +               zap_page_range(info->vma, 
54059 +                              MMAP_VADDR(info->user_vstart, u_idx, 0), 
54060 +                              req->nr_pages << PAGE_SHIFT, NULL);
54061 +}
54062 +
54063 +/******************************************************************
54064 + * SCHEDULER FUNCTIONS
54065 + */
54066 +
54067 +static void print_stats(blkif_t *blkif)
54068 +{
54069 +       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
54070 +              current->comm, blkif->st_oo_req,
54071 +              blkif->st_rd_req, blkif->st_wr_req);
54072 +       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
54073 +       blkif->st_rd_req = 0;
54074 +       blkif->st_wr_req = 0;
54075 +       blkif->st_oo_req = 0;
54076 +}
54077 +
54078 +int tap_blkif_schedule(void *arg)
54079 +{
54080 +       blkif_t *blkif = arg;
54081 +
54082 +       blkif_get(blkif);
54083 +
54084 +       if (debug_lvl)
54085 +               printk(KERN_DEBUG "%s: started\n", current->comm);
54086 +
54087 +       while (!kthread_should_stop()) {
54088 +               wait_event_interruptible(
54089 +                       blkif->wq,
54090 +                       blkif->waiting_reqs || kthread_should_stop());
54091 +               wait_event_interruptible(
54092 +                       pending_free_wq,
54093 +                       !list_empty(&pending_free) || kthread_should_stop());
54094 +
54095 +               blkif->waiting_reqs = 0;
54096 +               smp_mb(); /* clear flag *before* checking for work */
54097 +
54098 +               if (do_block_io_op(blkif))
54099 +                       blkif->waiting_reqs = 1;
54100 +
54101 +               if (log_stats && time_after(jiffies, blkif->st_print))
54102 +                       print_stats(blkif);
54103 +       }
54104 +
54105 +       if (log_stats)
54106 +               print_stats(blkif);
54107 +       if (debug_lvl)
54108 +               printk(KERN_DEBUG "%s: exiting\n", current->comm);
54109 +
54110 +       blkif->xenblkd = NULL;
54111 +       blkif_put(blkif);
54112 +
54113 +       return 0;
54114 +}
54115 +
54116 +/******************************************************************
54117 + * COMPLETION CALLBACK -- Called by user level ioctl()
54118 + */
54119 +
54120 +static int blktap_read_ufe_ring(tap_blkif_t *info)
54121 +{
54122 +       /* This is called to read responses from the UFE ring. */
54123 +       RING_IDX i, j, rp;
54124 +       blkif_response_t *resp;
54125 +       blkif_t *blkif=NULL;
54126 +       int pending_idx, usr_idx, mmap_idx;
54127 +       pending_req_t *pending_req;
54128 +       
54129 +       if (!info)
54130 +               return 0;
54131 +
54132 +       /* We currently only forward packets in INTERCEPT_FE mode. */
54133 +       if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
54134 +               return 0;
54135 +
54136 +       /* for each outstanding message on the UFEring  */
54137 +       rp = info->ufe_ring.sring->rsp_prod;
54138 +       rmb();
54139 +        
54140 +       for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
54141 +               blkif_response_t res;
54142 +               resp = RING_GET_RESPONSE(&info->ufe_ring, i);
54143 +               memcpy(&res, resp, sizeof(res));
54144 +               mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
54145 +               ++info->ufe_ring.rsp_cons;
54146 +
54147 +               /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
54148 +               usr_idx = (int)res.id;
54149 +               pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
54150 +               mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
54151 +
54152 +               if ( (mmap_idx >= mmap_alloc) || 
54153 +                  (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
54154 +                       WPRINTK("Incorrect req map"
54155 +                              "[%d], internal map [%d,%d (%d)]\n", 
54156 +                              usr_idx, mmap_idx, 
54157 +                              ID_TO_IDX(info->idx_map[usr_idx]),
54158 +                              MASK_PEND_IDX(
54159 +                                      ID_TO_IDX(info->idx_map[usr_idx])));
54160 +
54161 +               pending_req = &pending_reqs[mmap_idx][pending_idx];
54162 +               blkif = pending_req->blkif;
54163 +
54164 +               for (j = 0; j < pending_req->nr_pages; j++) {
54165 +
54166 +                       unsigned long kvaddr, uvaddr;
54167 +                       struct page **map = info->vma->vm_private_data;
54168 +                       struct page *pg;
54169 +                       int offset;
54170 +
54171 +                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
54172 +                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
54173 +
54174 +                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
54175 +                       ClearPageReserved(pg);
54176 +                       offset = (uvaddr - info->vma->vm_start) 
54177 +                               >> PAGE_SHIFT;
54178 +                       map[offset] = NULL;
54179 +               }
54180 +               fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
54181 +               info->idx_map[usr_idx] = INVALID_REQ;
54182 +               make_response(blkif, pending_req->id, res.operation,
54183 +                             res.status);
54184 +               blkif_put(pending_req->blkif);
54185 +               free_req(pending_req);
54186 +       }
54187 +               
54188 +       return 0;
54189 +}
54190 +
54191 +
54192 +/******************************************************************************
54193 + * NOTIFICATION FROM GUEST OS.
54194 + */
54195 +
54196 +static void blkif_notify_work(blkif_t *blkif)
54197 +{
54198 +       blkif->waiting_reqs = 1;
54199 +       wake_up(&blkif->wq);
54200 +}
54201 +
54202 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
54203 +{
54204 +       blkif_notify_work(dev_id);
54205 +       return IRQ_HANDLED;
54206 +}
54207 +
54208 +
54209 +
54210 +/******************************************************************
54211 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
54212 + */
54213 +static int print_dbug = 1;
54214 +static int do_block_io_op(blkif_t *blkif)
54215 +{
54216 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
54217 +       blkif_request_t req;
54218 +       pending_req_t *pending_req;
54219 +       RING_IDX rc, rp;
54220 +       int more_to_do = 0;
54221 +       tap_blkif_t *info;
54222 +
54223 +       rc = blk_ring->req_cons;
54224 +       rp = blk_ring->sring->req_prod;
54225 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
54226 +
54227 +       /*Check blkif has corresponding UE ring*/
54228 +       if (blkif->dev_num < 0) {
54229 +               /*oops*/
54230 +               if (print_dbug) {
54231 +                       WPRINTK("Corresponding UE " 
54232 +                              "ring does not exist!\n");
54233 +                       print_dbug = 0; /*We only print this message once*/
54234 +               }
54235 +               return 0;
54236 +       }
54237 +
54238 +       info = tapfds[blkif->dev_num];
54239 +
54240 +       if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
54241 +               if (print_dbug) {
54242 +                       WPRINTK("Can't get UE info!\n");
54243 +                       print_dbug = 0;
54244 +               }
54245 +               return 0;
54246 +       }
54247 +
54248 +       while (rc != rp) {
54249 +               
54250 +               if (RING_FULL(&info->ufe_ring)) {
54251 +                       WPRINTK("RING_FULL! More to do\n");
54252 +                       more_to_do = 1;
54253 +                       break;
54254 +               }
54255 +               
54256 +               if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
54257 +                       WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
54258 +                              " More to do\n");
54259 +                       more_to_do = 1;
54260 +                       break;          
54261 +               }
54262 +
54263 +               pending_req = alloc_req();
54264 +               if (NULL == pending_req) {
54265 +                       blkif->st_oo_req++;
54266 +                       more_to_do = 1;
54267 +                       break;
54268 +               }
54269 +
54270 +               memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
54271 +               blk_ring->req_cons = ++rc; /* before make_response() */ 
54272 +
54273 +               switch (req.operation) {
54274 +               case BLKIF_OP_READ:
54275 +                       blkif->st_rd_req++;
54276 +                       dispatch_rw_block_io(blkif, &req, pending_req);
54277 +                       break;
54278 +
54279 +               case BLKIF_OP_WRITE:
54280 +                       blkif->st_wr_req++;
54281 +                       dispatch_rw_block_io(blkif, &req, pending_req);
54282 +                       break;
54283 +
54284 +               default:
54285 +                       WPRINTK("unknown operation [%d]\n",
54286 +                               req.operation);
54287 +                       make_response(blkif, req.id, req.operation,
54288 +                                     BLKIF_RSP_ERROR);
54289 +                       free_req(pending_req);
54290 +                       break;
54291 +               }
54292 +       }
54293 +               
54294 +       blktap_kick_user(blkif->dev_num);
54295 +
54296 +       return more_to_do;
54297 +}
54298 +
54299 +static void dispatch_rw_block_io(blkif_t *blkif,
54300 +                                blkif_request_t *req,
54301 +                                pending_req_t *pending_req)
54302 +{
54303 +       extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
54304 +       int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
54305 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
54306 +       unsigned int nseg;
54307 +       int ret, i;
54308 +       tap_blkif_t *info;
54309 +       uint64_t sector;
54310 +       blkif_request_t *target;
54311 +       int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
54312 +       int usr_idx;
54313 +       uint16_t mmap_idx = pending_req->mem_idx;
54314 +
54315 +       if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
54316 +               goto fail_response;
54317 +
54318 +       info = tapfds[blkif->dev_num];
54319 +       if (info == NULL)
54320 +               goto fail_response;
54321 +
54322 +       /* Check we have space on user ring - should never fail. */
54323 +       usr_idx = GET_NEXT_REQ(info->idx_map);
54324 +       if (usr_idx == INVALID_REQ) {
54325 +               BUG();
54326 +               goto fail_response;
54327 +       }
54328 +
54329 +       /* Check that number of segments is sane. */
54330 +       nseg = req->nr_segments;
54331 +       if ( unlikely(nseg == 0) || 
54332 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
54333 +               WPRINTK("Bad number of segments in request (%d)\n", nseg);
54334 +               goto fail_response;
54335 +       }
54336 +       
54337 +       /* Make sure userspace is ready. */
54338 +       if (!info->ring_ok) {
54339 +               WPRINTK("blktap: ring not ready for requests!\n");
54340 +               goto fail_response;
54341 +       }
54342 +
54343 +       if (RING_FULL(&info->ufe_ring)) {
54344 +               WPRINTK("blktap: fe_ring is full, can't add request; "
54345 +                       "IO request will be dropped. %d %d\n",
54346 +                       RING_SIZE(&info->ufe_ring),
54347 +                       RING_SIZE(&blkif->blk_ring));
54348 +               goto fail_response;
54349 +       }
54350 +
54351 +       pending_req->blkif     = blkif;
54352 +       pending_req->id        = req->id;
54353 +       pending_req->operation = operation;
54354 +       pending_req->status    = BLKIF_RSP_OKAY;
54355 +       pending_req->nr_pages  = nseg;
54356 +       op = 0;
54357 +       for (i = 0; i < nseg; i++) {
54358 +               unsigned long uvaddr;
54359 +               unsigned long kvaddr;
54360 +               uint64_t ptep;
54361 +               uint32_t flags;
54362 +
54363 +               uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
54364 +               kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
54365 +
54366 +               sector = req->sector_number + ((PAGE_SIZE / 512) * i);
54367 +               if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
54368 +                       WPRINTK("BLKTAP: Sector request greater "
54369 +                              "than size\n");
54370 +                       WPRINTK("BLKTAP: %s request sector "
54371 +                              "[%llu,%llu], Total [%llu]\n",
54372 +                              (req->operation == 
54373 +                               BLKIF_OP_WRITE ? "WRITE" : "READ"),
54374 +                               (long long unsigned) sector,
54375 +                               (long long unsigned) sector>>9,
54376 +                               (long long unsigned) blkif->sectors);
54377 +               }
54378 +
54379 +               flags = GNTMAP_host_map;
54380 +               if (operation == WRITE)
54381 +                       flags |= GNTMAP_readonly;
54382 +               gnttab_set_map_op(&map[op], kvaddr, flags,
54383 +                                 req->seg[i].gref, blkif->domid);
54384 +               op++;
54385 +
54386 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
54387 +                       /* Now map it to user. */
54388 +                       ret = create_lookup_pte_addr(info->vma->vm_mm, 
54389 +                                                    uvaddr, &ptep);
54390 +                       if (ret) {
54391 +                               WPRINTK("Couldn't get a pte addr!\n");
54392 +                               goto fail_flush;
54393 +                       }
54394 +
54395 +                       flags = GNTMAP_host_map | GNTMAP_application_map
54396 +                               | GNTMAP_contains_pte;
54397 +                       if (operation == WRITE)
54398 +                               flags |= GNTMAP_readonly;
54399 +                       gnttab_set_map_op(&map[op], ptep, flags,
54400 +                                         req->seg[i].gref, blkif->domid);
54401 +                       op++;
54402 +               }
54403 +       }
54404 +
54405 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
54406 +       BUG_ON(ret);
54407 +
54408 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
54409 +               for (i = 0; i < (nseg*2); i+=2) {
54410 +                       unsigned long uvaddr;
54411 +                       unsigned long kvaddr;
54412 +                       unsigned long offset;
54413 +                       struct page *pg;
54414 +
54415 +                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
54416 +                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
54417 +
54418 +                       if (unlikely(map[i].status != 0)) {
54419 +                               WPRINTK("invalid kernel buffer -- "
54420 +                                       "could not remap it\n");
54421 +                               ret |= 1;
54422 +                               map[i].handle = INVALID_GRANT_HANDLE;
54423 +                       }
54424 +
54425 +                       if (unlikely(map[i+1].status != 0)) {
54426 +                               WPRINTK("invalid user buffer -- "
54427 +                                       "could not remap it\n");
54428 +                               ret |= 1;
54429 +                               map[i+1].handle = INVALID_GRANT_HANDLE;
54430 +                       }
54431 +
54432 +                       pending_handle(mmap_idx, pending_idx, i/2).kernel 
54433 +                               = map[i].handle;
54434 +                       pending_handle(mmap_idx, pending_idx, i/2).user   
54435 +                               = map[i+1].handle;
54436 +
54437 +                       if (ret)
54438 +                               continue;
54439 +
54440 +                       set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
54441 +                                           FOREIGN_FRAME(map[i].dev_bus_addr
54442 +                                                         >> PAGE_SHIFT));
54443 +                       offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
54444 +                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
54445 +                       ((struct page **)info->vma->vm_private_data)[offset] =
54446 +                               pg;
54447 +               }
54448 +       } else {
54449 +               for (i = 0; i < nseg; i++) {
54450 +                       unsigned long uvaddr;
54451 +                       unsigned long kvaddr;
54452 +                       unsigned long offset;
54453 +                       struct page *pg;
54454 +
54455 +                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
54456 +                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
54457 +
54458 +                       if (unlikely(map[i].status != 0)) {
54459 +                               WPRINTK("invalid kernel buffer -- "
54460 +                                       "could not remap it\n");
54461 +                               ret |= 1;
54462 +                               map[i].handle = INVALID_GRANT_HANDLE;
54463 +                       }
54464 +
54465 +                       pending_handle(mmap_idx, pending_idx, i).kernel 
54466 +                               = map[i].handle;
54467 +
54468 +                       if (ret)
54469 +                               continue;
54470 +
54471 +                       offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
54472 +                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
54473 +                       ((struct page **)info->vma->vm_private_data)[offset] =
54474 +                               pg;
54475 +               }
54476 +       }
54477 +
54478 +       if (ret)
54479 +               goto fail_flush;
54480 +
54481 +       if (xen_feature(XENFEAT_auto_translated_physmap))
54482 +               down_write(&info->vma->vm_mm->mmap_sem);
54483 +       /* Mark mapped pages as reserved: */
54484 +       for (i = 0; i < req->nr_segments; i++) {
54485 +               unsigned long kvaddr;
54486 +               struct page *pg;
54487 +
54488 +               kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
54489 +               pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
54490 +               SetPageReserved(pg);
54491 +               if (xen_feature(XENFEAT_auto_translated_physmap)) {
54492 +                       ret = vm_insert_page(info->vma,
54493 +                                            MMAP_VADDR(info->user_vstart,
54494 +                                                       usr_idx, i), pg);
54495 +                       if (ret) {
54496 +                               up_write(&info->vma->vm_mm->mmap_sem);
54497 +                               goto fail_flush;
54498 +                       }
54499 +               }
54500 +       }
54501 +       if (xen_feature(XENFEAT_auto_translated_physmap))
54502 +               up_write(&info->vma->vm_mm->mmap_sem);
54503 +       
54504 +       /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
54505 +       info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
54506 +
54507 +       blkif_get(blkif);
54508 +       /* Finally, write the request message to the user ring. */
54509 +       target = RING_GET_REQUEST(&info->ufe_ring,
54510 +                                 info->ufe_ring.req_prod_pvt);
54511 +       memcpy(target, req, sizeof(*req));
54512 +       target->id = usr_idx;
54513 +       wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
54514 +       info->ufe_ring.req_prod_pvt++;
54515 +       return;
54516 +
54517 + fail_flush:
54518 +       WPRINTK("Reached Fail_flush\n");
54519 +       fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
54520 + fail_response:
54521 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
54522 +       free_req(pending_req);
54523 +} 
54524 +
54525 +
54526 +
54527 +/******************************************************************
54528 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
54529 + */
54530 +
54531 +
54532 +static void make_response(blkif_t *blkif, unsigned long id, 
54533 +                          unsigned short op, int st)
54534 +{
54535 +       blkif_response_t *resp;
54536 +       unsigned long     flags;
54537 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
54538 +       int more_to_do = 0;
54539 +       int notify;
54540 +
54541 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
54542 +       /* Place on the response ring for the relevant domain. */ 
54543 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
54544 +       resp->id        = id;
54545 +       resp->operation = op;
54546 +       resp->status    = st;
54547 +       blk_ring->rsp_prod_pvt++;
54548 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
54549 +
54550 +       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
54551 +               /*
54552 +                * Tail check for pending requests. Allows frontend to avoid
54553 +                * notifications if requests are already in flight (lower
54554 +                * overheads and promotes batching).
54555 +                */
54556 +               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
54557 +       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
54558 +               more_to_do = 1;
54559 +
54560 +       }       
54561 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
54562 +       if (more_to_do)
54563 +               blkif_notify_work(blkif);
54564 +       if (notify)
54565 +               notify_remote_via_irq(blkif->irq);
54566 +}
54567 +
54568 +static int __init blkif_init(void)
54569 +{
54570 +       int i,ret,blktap_dir;
54571 +
54572 +       if (!is_running_on_xen())
54573 +               return -ENODEV;
54574 +
54575 +       INIT_LIST_HEAD(&pending_free);
54576 +        for(i = 0; i < 2; i++) {
54577 +               ret = req_increase();
54578 +               if (ret)
54579 +                       break;
54580 +       }
54581 +       if (i == 0)
54582 +               return ret;
54583 +
54584 +       tap_blkif_interface_init();
54585 +
54586 +       alloc_pending_reqs = 0;
54587 +
54588 +       tap_blkif_xenbus_init();
54589 +
54590 +       /* Dynamically allocate a major for this device */
54591 +       ret = register_chrdev(0, "blktap", &blktap_fops);
54592 +       blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
54593 +
54594 +       if ( (ret < 0)||(blktap_dir < 0) ) {
54595 +               WPRINTK("Couldn't register /dev/xen/blktap\n");
54596 +               return -ENOMEM;
54597 +       }       
54598 +       
54599 +       blktap_major = ret;
54600 +
54601 +       /* tapfds[0] is always NULL */
54602 +       blktap_next_minor++;
54603 +
54604 +       ret = devfs_mk_cdev(MKDEV(blktap_major, i),
54605 +                           S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
54606 +
54607 +       if(ret != 0)
54608 +               return -ENOMEM;
54609 +
54610 +       DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
54611 +
54612 +       /* Make sure the xen class exists */
54613 +       if (!setup_xen_class()) {
54614 +               /*
54615 +                * This will allow udev to create the blktap ctrl device.
54616 +                * We only want to create blktap0 first.  We don't want
54617 +                * to flood the sysfs system with needless blktap devices.
54618 +                * We only create the device when a request for a new device is
54619 +                * made.
54620 +                */
54621 +               class_device_create(xen_class, NULL,
54622 +                                   MKDEV(blktap_major, 0), NULL,
54623 +                                   "blktap0");
54624 +       } else {
54625 +               /* this is bad, but not fatal */
54626 +               WPRINTK("blktap: sysfs xen_class not created\n");
54627 +       }
54628 +
54629 +       DPRINTK("Blktap device successfully created\n");
54630 +
54631 +       return 0;
54632 +}
54633 +
54634 +module_init(blkif_init);
54635 +
54636 +MODULE_LICENSE("Dual BSD/GPL");
54637 diff -ruNp linux-2.6.19/drivers/xen/blktap/blktapmain.c linux-2.6.19-xen-3.0.4/drivers/xen/blktap/blktapmain.c
54638 --- linux-2.6.19/drivers/xen/blktap/blktapmain.c        1970-01-01 00:00:00.000000000 +0000
54639 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blktap/blktapmain.c      2007-02-02 19:10:45.000000000 +0000
54640 @@ -0,0 +1,1507 @@
54641 +/******************************************************************************
54642 + * drivers/xen/blktap/blktap.c
54643 + * 
54644 + * Back-end driver for user level virtual block devices. This portion of the
54645 + * driver exports a 'unified' block-device interface that can be accessed
54646 + * by any operating system that implements a compatible front end. Requests
54647 + * are remapped to a user-space memory region.
54648 + *
54649 + * Based on the blkback driver code.
54650 + * 
54651 + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
54652 + *
54653 + * Clean ups and fix ups:
54654 + *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
54655 + *
54656 + * This program is free software; you can redistribute it and/or
54657 + * modify it under the terms of the GNU General Public License version 2
54658 + * as published by the Free Software Foundation; or, when distributed
54659 + * separately from the Linux kernel or incorporated into other
54660 + * software packages, subject to the following license:
54661 + * 
54662 + * Permission is hereby granted, free of charge, to any person obtaining a copy
54663 + * of this source file (the "Software"), to deal in the Software without
54664 + * restriction, including without limitation the rights to use, copy, modify,
54665 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
54666 + * and to permit persons to whom the Software is furnished to do so, subject to
54667 + * the following conditions:
54668 + * 
54669 + * The above copyright notice and this permission notice shall be included in
54670 + * all copies or substantial portions of the Software.
54671 + * 
54672 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
54673 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
54674 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54675 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
54676 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54677 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
54678 + * IN THE SOFTWARE.
54679 + */
54680 +
54681 +#include <linux/spinlock.h>
54682 +#include <linux/kthread.h>
54683 +#include <linux/list.h>
54684 +#include <asm/hypervisor.h>
54685 +#include "common.h"
54686 +#include <xen/balloon.h>
54687 +#include <linux/kernel.h>
54688 +#include <linux/fs.h>
54689 +#include <linux/mm.h>
54690 +#include <linux/errno.h>
54691 +#include <linux/major.h>
54692 +#include <linux/gfp.h>
54693 +#include <linux/poll.h>
54694 +#include <asm/tlbflush.h>
54695 +
54696 +#define MAX_TAP_DEV 256     /*the maximum number of tapdisk ring devices    */
54697 +#define MAX_DEV_NAME 100    /*max length of a tapdisk ring device name, e.g. blktap0 */
54698 +
54699 +
54700 +struct class *xen_class;
54701 +EXPORT_SYMBOL_GPL(xen_class);
54702 +
54703 +/*
54704 + * Setup the xen class.  This should probably go in another file, but
54705 + * since blktap is the only user of it so far, it gets to keep it.
54706 + */
54707 +int setup_xen_class(void)
54708 +{
54709 +       int ret;
54710 +
54711 +       if (xen_class)
54712 +               return 0;
54713 +
54714 +       xen_class = class_create(THIS_MODULE, "xen");
54715 +       if (IS_ERR(xen_class)) {
54716 +               ret = PTR_ERR(xen_class);
54717 +               xen_class = NULL;
54718 +               return ret;
54719 +       }
54720 +       return 0;
54721 +}
54722 +
54723 +/*
54724 + * The maximum number of requests that can be outstanding at any time
54725 + * is determined by 
54726 + *
54727 + *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
54728 + *
54729 + * where mmap_alloc < MAX_DYNAMIC_MEM.
54730 + *
54731 + * TODO:
54732 + * mmap_alloc is initialised to 2 and should be adjustable on the fly via
54733 + * sysfs.
54734 + */
54735 +#define BLK_RING_SIZE          __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
54736 +#define MAX_DYNAMIC_MEM                BLK_RING_SIZE
54737 +#define MAX_PENDING_REQS       BLK_RING_SIZE
54738 +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
54739 +#define MMAP_VADDR(_start, _req,_seg)                                   \
54740 +        (_start +                                                       \
54741 +         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
54742 +         ((_seg) * PAGE_SIZE))
54743 +static int blkif_reqs = MAX_PENDING_REQS;
54744 +static int mmap_pages = MMAP_PAGES;
54745 +
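+/*
+ * Rough sizing illustration (assuming a 4 KiB PAGE_SIZE and the usual
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST of 11 from the blkif interface headers;
+ * the exact ring size is whatever __RING_SIZE() yields for this build):
+ *
+ *   BLK_RING_SIZE ~= 32 entries, so MMAP_PAGES = 32 * 11 = 352 pages per
+ *   memory bank, and with the default mmap_alloc of 2 banks up to
+ *   704 foreign pages (about 2.75 MiB) can be mapped at once.
+ *
+ * MMAP_VADDR() then locates segment _seg of request _req within the user
+ * mapping, e.g. request 3, segment 5 lives at
+ * _start + (3 * 11 + 5) * PAGE_SIZE.
+ */
+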
54746 +#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
54747 +                     * have a bunch of pages reserved for shared
54748 +                     * memory rings.
54749 +                     */
54750 +
54751 +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
54752 +typedef struct domid_translate {
54753 +       unsigned short domid;
54754 +       unsigned short busid;
54755 +} domid_translate_t ;
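+
+/*
+ * Userspace hands this pair to BLKTAP_IOCTL_NEWINTF packed into the ioctl
+ * argument.  A sketch of the caller side (illustration only; ctrl_fd is a
+ * placeholder for an open fd on the blktap control node, and the layout
+ * assumes a little-endian host, matching how the handler below
+ * reinterprets the value):
+ *
+ *   domid_translate_t tr = { .domid = 1, .busid = 768 };
+ *   unsigned long arg = ((unsigned long)tr.busid << 16) | tr.domid;
+ *   int minor = ioctl(ctrl_fd, BLKTAP_IOCTL_NEWINTF, arg);
+ *
+ * i.e. domid occupies bits 0-15 of the argument and busid bits 16-31.
+ */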
54756 +
54757 +/*Data struct associated with each of the tapdisk devices*/
54758 +typedef struct tap_blkif {
54759 +       struct vm_area_struct *vma;   /*Shared memory area                   */
54760 +       unsigned long rings_vstart;   /*Kernel memory mapping                */
54761 +       unsigned long user_vstart;    /*User memory mapping                  */
54762 +       unsigned long dev_inuse;      /*One process opens device at a time.  */
54763 +       unsigned long dev_pending;    /*In process of being opened           */
54764 +       unsigned long ring_ok;        /*make this ring->state                */
54765 +       blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
54766 +       wait_queue_head_t wait;       /*for poll                             */
54767 +       unsigned long mode;           /*current switching mode               */
54768 +       int minor;                    /*Minor number for tapdisk device      */
54769 +       pid_t pid;                    /*tapdisk process id                   */
54770 +       enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
54771 +                                                 shutdown                   */
54772 +       unsigned long *idx_map;       /*Record the user ring id to kern 
54773 +                                       [req id, idx] tuple                  */
54774 +       blkif_t *blkif;               /*Associate blkif with tapdev          */
54775 +       struct domid_translate trans; /*Translation from domid to bus.       */
54776 +} tap_blkif_t;
54777 +
54778 +static struct tap_blkif *tapfds[MAX_TAP_DEV];
54779 +static int blktap_next_minor;
54780 +
54781 +static int __init set_blkif_reqs(char *str)
54782 +{
54783 +       get_option(&str, &blkif_reqs);
54784 +       return 1;
54785 +}
54786 +__setup("blkif_reqs=", set_blkif_reqs);
54787 +
54788 +/* Run-time switchable: /sys/module/blktap/parameters/ */
54789 +static unsigned int log_stats = 0;
54790 +static unsigned int debug_lvl = 0;
54791 +module_param(log_stats, int, 0644);
54792 +module_param(debug_lvl, int, 0644);
54793 +
54794 +/*
54795 + * Each outstanding request that we've passed to the lower device layers has a 
54796 + * 'pending_req' allocated to it. Each buffer_head that completes decrements 
54797 + * the pendcnt towards zero. When it hits zero, the specified domain has a 
54798 + * response queued for it, with the saved 'id' passed back.
54799 + */
54800 +typedef struct {
54801 +       blkif_t       *blkif;
54802 +       unsigned long  id;
54803 +       unsigned short mem_idx;
54804 +       int            nr_pages;
54805 +       atomic_t       pendcnt;
54806 +       unsigned short operation;
54807 +       int            status;
54808 +       struct list_head free_list;
54809 +       int            inuse;
54810 +} pending_req_t;
54811 +
54812 +static pending_req_t *pending_reqs[MAX_PENDING_REQS];
54813 +static struct list_head pending_free;
54814 +static DEFINE_SPINLOCK(pending_free_lock);
54815 +static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
54816 +static int alloc_pending_reqs;
54817 +
54818 +typedef unsigned int PEND_RING_IDX;
54819 +
54820 +static inline int MASK_PEND_IDX(int i) { 
54821 +       return (i & (MAX_PENDING_REQS-1));
54822 +}
54823 +
54824 +static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
54825 +       return (req - pending_reqs[idx]);
54826 +}
54827 +
54828 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
54829 +
54830 +#define BLKBACK_INVALID_HANDLE (~0)
54831 +
54832 +static struct page **foreign_pages[MAX_DYNAMIC_MEM];
54833 +static inline unsigned long idx_to_kaddr(
54834 +       unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
54835 +{
54836 +       unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
54837 +       unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
54838 +       return (unsigned long)pfn_to_kaddr(pfn);
54839 +}
54840 +
54841 +static unsigned short mmap_alloc = 0;
54842 +static unsigned short mmap_lock = 0;
54843 +static unsigned short mmap_inuse = 0;
54844 +
54845 +/******************************************************************
54846 + * GRANT HANDLES
54847 + */
54848 +
54849 +/* When grant tables are used to map a frame for device access, the
54850 + * handle returned must be used to unmap the frame. This is needed to
54851 + * drop the ref count on the frame.
54852 + */
54853 +struct grant_handle_pair
54854 +{
54855 +        grant_handle_t kernel;
54856 +        grant_handle_t user;
54857 +};
54858 +#define INVALID_GRANT_HANDLE   0xFFFF
54859 +
54860 +static struct grant_handle_pair 
54861 +    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
54862 +#define pending_handle(_id, _idx, _i) \
54863 +    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
54864 +    + (_i)])
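+
+/*
+ * Each mapped segment is tracked twice, once for its kernel mapping and
+ * once for its user mapping, which is why fast_flush_area() below sizes
+ * its unmap batch at BLKIF_MAX_SEGMENTS_PER_REQUEST * 2.  For example the
+ * kernel handle of bank 1, request 2, segment 3 is
+ *
+ *   pending_handle(1, 2, 3).kernel, i.e.
+ *   pending_grant_handles[1][2 * BLKIF_MAX_SEGMENTS_PER_REQUEST + 3].kernel
+ */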
54865 +
54866 +
54867 +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
54868 +
54869 +#define BLKTAP_MINOR 0  /*/dev/xen/blktap has a dynamic major */
54870 +#define BLKTAP_DEV_DIR  "/dev/xen"
54871 +
54872 +static int blktap_major;
54873 +
54874 +/* blktap IOCTLs: */
54875 +#define BLKTAP_IOCTL_KICK_FE         1
54876 +#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
54877 +#define BLKTAP_IOCTL_SETMODE         3
54878 +#define BLKTAP_IOCTL_SENDPID        4
54879 +#define BLKTAP_IOCTL_NEWINTF        5
54880 +#define BLKTAP_IOCTL_MINOR          6
54881 +#define BLKTAP_IOCTL_MAJOR          7
54882 +#define BLKTAP_QUERY_ALLOC_REQS      8
54883 +#define BLKTAP_IOCTL_FREEINTF        9
54884 +#define BLKTAP_IOCTL_PRINT_IDXS      100  
54885 +
54886 +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
54887 +#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
54888 +#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
54889 +#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
54890 +
54891 +#define BLKTAP_MODE_INTERPOSE \
54892 +           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
54893 +
54894 +
54895 +static inline int BLKTAP_MODE_VALID(unsigned long arg)
54896 +{
54897 +       return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
54898 +               (arg == BLKTAP_MODE_INTERCEPT_FE) ||
54899 +                (arg == BLKTAP_MODE_INTERPOSE   ));
54900 +}
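+
+/*
+ * Sketch of how a userspace tapdisk-style process is expected to drive
+ * this interface (ctrl_fd/tap_fd are placeholder names; the authoritative
+ * sequence lives in the userspace blktap tools, not in this driver):
+ *
+ *   minor = ioctl(ctrl_fd, BLKTAP_IOCTL_NEWINTF, packed_domid_busid);
+ *   tap_fd = open("/dev/xen/blktap<minor>", O_RDWR);
+ *   ioctl(tap_fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
+ *   ioctl(tap_fd, BLKTAP_IOCTL_SENDPID, getpid());
+ *   mmap(NULL, (mmap_pages + RING_PAGES) << PAGE_SHIFT,
+ *        PROT_READ | PROT_WRITE, MAP_SHARED, tap_fd, 0);
+ *   ... poll(tap_fd), service requests in userspace, then
+ *   ioctl(tap_fd, BLKTAP_IOCTL_KICK_FE, 0) to push responses back ...
+ */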
54901 +
54902 +/* Requests passing through the tap to userspace are re-assigned an ID.
54903 + * We must record a mapping between the BE [IDX,ID] tuple and the userspace
54904 + * ring ID. 
54905 + */
54906 +
54907 +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
54908 +{
54909 +        return ((fe_dom << 16) | MASK_PEND_IDX(idx));
54910 +}
54911 +
54912 +extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
54913 +{
54914 +        return (PEND_RING_IDX)(id & 0x0000ffff);
54915 +}
54916 +
54917 +extern inline int ID_TO_MIDX(unsigned long id)
54918 +{
54919 +        return (int)(id >> 16);
54920 +}
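+
+/*
+ * Round-trip example: for memory bank 1 and pending-ring index 5,
+ * MAKE_ID(1, 5) == (1 << 16) | 5, from which ID_TO_MIDX() recovers 1 and
+ * ID_TO_IDX() recovers 5.  (Despite the fe_dom parameter name, the callers
+ * below pass the mmap memory-bank index, not a domain id.)
+ */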
54921 +
54922 +#define INVALID_REQ 0xdead0000
54923 +
54924 +/*TODO: Convert to a free list*/
54925 +static inline int GET_NEXT_REQ(unsigned long *idx_map)
54926 +{
54927 +       int i;
54928 +       for (i = 0; i < MAX_PENDING_REQS; i++)
54929 +               if (idx_map[i] == INVALID_REQ)
54930 +                       return i;
54931 +
54932 +       return INVALID_REQ;
54933 +}
54934 +
54935 +
54936 +#define BLKTAP_INVALID_HANDLE(_g) \
54937 +    (((_g->kernel) == INVALID_GRANT_HANDLE) &&  \
54938 +     ((_g->user) == INVALID_GRANT_HANDLE))
54939 +
54940 +#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
54941 +    (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
54942 +    } while(0)
54943 +
54944 +
54945 +/******************************************************************
54946 + * BLKTAP VM OPS
54947 + */
54948 +
54949 +static struct page *blktap_nopage(struct vm_area_struct *vma,
54950 +                                 unsigned long address,
54951 +                                 int *type)
54952 +{
54953 +       /*
54954 +        * if the page has not been mapped in by the driver then return
54955 +        * NOPAGE_SIGBUS to the domain.
54956 +        */
54957 +
54958 +       return NOPAGE_SIGBUS;
54959 +}
54960 +
54961 +struct vm_operations_struct blktap_vm_ops = {
54962 +       nopage:   blktap_nopage,
54963 +};
54964 +
54965 +/******************************************************************
54966 + * BLKTAP FILE OPS
54967 + */
54968 +
54969 +/*Function Declarations*/
54970 +static tap_blkif_t *get_next_free_dev(void);
54971 +static int blktap_open(struct inode *inode, struct file *filp);
54972 +static int blktap_release(struct inode *inode, struct file *filp);
54973 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
54974 +static int blktap_ioctl(struct inode *inode, struct file *filp,
54975 +                        unsigned int cmd, unsigned long arg);
54976 +static unsigned int blktap_poll(struct file *file, poll_table *wait);
54977 +
54978 +static struct file_operations blktap_fops = {
54979 +       .owner   = THIS_MODULE,
54980 +       .poll    = blktap_poll,
54981 +       .ioctl   = blktap_ioctl,
54982 +       .open    = blktap_open,
54983 +       .release = blktap_release,
54984 +       .mmap    = blktap_mmap,
54985 +};
54986 +
54987 +
54988 +static tap_blkif_t *get_next_free_dev(void)
54989 +{
54990 +       tap_blkif_t *info;
54991 +       int minor;
54992 +
54993 +       /*
54994 +        * This is called only from the ioctl, which
54995 +        * means we should always have interrupts enabled.
54996 +        */
54997 +       BUG_ON(irqs_disabled());
54998 +
54999 +       spin_lock_irq(&pending_free_lock);
55000 +
55001 +       /* tapfds[0] is always NULL */
55002 +
55003 +       for (minor = 1; minor < blktap_next_minor; minor++) {
55004 +               info = tapfds[minor];
55005 +               /* we could have failed a previous attempt. */
55006 +               if (!info ||
55007 +                   ((info->dev_inuse == 0) &&
55008 +                    (info->dev_pending == 0)) ) {
55009 +                       info->dev_pending = 1;
55010 +                       goto found;
55011 +               }
55012 +       }
55013 +       info = NULL;
55014 +       minor = -1;
55015 +
55016 +       /*
55017 +        * We didn't find free device. If we can still allocate
55018 +        * We didn't find a free device. If we can still allocate more,
55019 +        * grab the next available device minor.  This is done while we
55020 +        * are still under the protection of the pending_free_lock.
55021 +        */
55022 +       if (blktap_next_minor < MAX_TAP_DEV)
55023 +               minor = blktap_next_minor++;
55024 +found:
55025 +       spin_unlock_irq(&pending_free_lock);
55026 +
55027 +       if (!info && minor > 0) {
55028 +               info = kzalloc(sizeof(*info), GFP_KERNEL);
55029 +               if (unlikely(!info)) {
55030 +                       /*
55031 +                        * If we failed here, try to put back
55032 +                        * the next minor number. But if one
55033 +                        * was just taken, then we just lose this
55034 +                        * minor.  We can try to allocate this
55035 +                        * minor again later.
55036 +                        */
55037 +                       spin_lock_irq(&pending_free_lock);
55038 +                       if (blktap_next_minor == minor+1)
55039 +                               blktap_next_minor--;
55040 +                       spin_unlock_irq(&pending_free_lock);
55041 +                       goto out;
55042 +               }
55043 +
55044 +               info->minor = minor;
55045 +               /*
55046 +                * Make sure that we have a minor before others can
55047 +                * see us.
55048 +                */
55049 +               wmb();
55050 +               tapfds[minor] = info;
55051 +
55052 +               class_device_create(xen_class, NULL,
55053 +                                   MKDEV(blktap_major, minor), NULL,
55054 +                                   "blktap%d", minor);
55055 +       }
55056 +
55057 +out:
55058 +       return info;
55059 +}
55060 +
55061 +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
55062 +{
55063 +       tap_blkif_t *info;
55064 +       int i;
55065 +
55066 +       for (i = 1; i < blktap_next_minor; i++) {
55067 +               info = tapfds[i];
55068 +               if ( info &&
55069 +                    (info->trans.domid == domid) &&
55070 +                    (info->trans.busid == xenbus_id) ) {
55071 +                       info->blkif = blkif;
55072 +                       info->status = RUNNING;
55073 +                       return i;
55074 +               }
55075 +       }
55076 +       return -1;
55077 +}
55078 +
55079 +void signal_tapdisk(int idx) 
55080 +{
55081 +       tap_blkif_t *info;
55082 +       struct task_struct *ptask;
55083 +
55084 +       info = tapfds[idx];
55085 +       if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
55086 +               return;
55087 +
55088 +       if (info->pid > 0) {
55089 +               ptask = find_task_by_pid(info->pid);
55090 +               if (ptask)
55091 +                       info->status = CLEANSHUTDOWN;
55092 +       }
55093 +       info->blkif = NULL;
55094 +
55095 +       return;
55096 +}
55097 +
55098 +static int blktap_open(struct inode *inode, struct file *filp)
55099 +{
55100 +       blkif_sring_t *sring;
55101 +       int idx = iminor(inode) - BLKTAP_MINOR;
55102 +       tap_blkif_t *info;
55103 +       int i;
55104 +       
55105 +       /* ctrl device, treat differently */
55106 +       if (!idx)
55107 +               return 0;
55108 +
55109 +       info = (idx > 0 && idx < MAX_TAP_DEV) ? tapfds[idx] : NULL;
55110 +
55111 +       if (!info) {
55112 +               WPRINTK("Unable to open device /dev/xen/blktap%d\n",
55113 +                       idx);
55114 +               return -ENODEV;
55115 +       }
55116 +
55117 +       DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
55118 +       
55119 +       /*Only one process can access device at a time*/
55120 +       if (test_and_set_bit(0, &info->dev_inuse))
55121 +               return -EBUSY;
55122 +
55123 +       info->dev_pending = 0;
55124 +           
55125 +       /* Allocate the fe ring. */
55126 +       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
55127 +       if (sring == NULL)
55128 +               goto fail_nomem;
55129 +
55130 +       SetPageReserved(virt_to_page(sring));
55131 +    
55132 +       SHARED_RING_INIT(sring);
55133 +       FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
55134 +       
55135 +       filp->private_data = info;
55136 +       info->vma = NULL;
55137 +
55138 +       info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, 
55139 +                               GFP_KERNEL);
55140 +       
55141 +       if (idx > 0) {
55142 +               init_waitqueue_head(&info->wait);
55143 +               for (i = 0; i < MAX_PENDING_REQS; i++) 
55144 +                       info->idx_map[i] = INVALID_REQ;
55145 +       }
55146 +
55147 +       DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
55148 +       return 0;
55149 +
55150 + fail_nomem:
55151 +       return -ENOMEM;
55152 +}
55153 +
55154 +static int blktap_release(struct inode *inode, struct file *filp)
55155 +{
55156 +       tap_blkif_t *info = filp->private_data;
55157 +       
55158 +       /* check for control device */
55159 +       if (!info)
55160 +               return 0;
55161 +
55162 +       info->dev_inuse = 0;
55163 +       DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
55164 +
55165 +       /* Free the ring page. */
55166 +       ClearPageReserved(virt_to_page(info->ufe_ring.sring));
55167 +       free_page((unsigned long) info->ufe_ring.sring);
55168 +
55169 +       /* Clear any active mappings and free foreign map table */
55170 +       if (info->vma) {
55171 +               zap_page_range(
55172 +                       info->vma, info->vma->vm_start, 
55173 +                       info->vma->vm_end - info->vma->vm_start, NULL);
55174 +               info->vma = NULL;
55175 +       }
55176 +       
55177 +       if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
55178 +               if (info->blkif->xenblkd != NULL) {
55179 +                       kthread_stop(info->blkif->xenblkd);
55180 +                       info->blkif->xenblkd = NULL;
55181 +               }
55182 +               info->status = CLEANSHUTDOWN;
55183 +       }       
55184 +       return 0;
55185 +}
55186 +
55187 +
55188 +/* Note on mmap:
55189 + * We need to map pages to user space in a way that will allow the block
55190 + * subsystem to set up direct IO to them.  This couldn't be done before, because
55191 + * there isn't really a sane way to translate a user virtual address down to a 
55192 + * physical address when the page belongs to another domain.
55193 + *
55194 + * My first approach was to map the page in to kernel memory, add an entry
55195 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
55196 + * and then attempt to map that page up to user space.  This is disallowed
55197 + * by xen though, which realizes that we don't really own the machine frame
55198 + * underlying the physical page.
55199 + *
55200 + * The new approach is to provide explicit support for this in Xen Linux.
55201 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
55202 + * mapped from other VMs.  vma->vm_private_data is set up as a mapping
55203 + * from pages to actual page structs.  There is a new clause in get_user_pages
55204 + * that does the right thing for this sort of mapping.
55205 + */
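+/*
+ * In effect, for a VM_FOREIGN vma the lookup get_user_pages() performs is
+ * roughly (a sketch of the idea, not the literal mm code):
+ *
+ *   struct page **map = vma->vm_private_data;
+ *   struct page *pg   = map[(addr - vma->vm_start) >> PAGE_SHIFT];
+ *
+ * which is exactly the table blktap_mmap() allocates below and
+ * dispatch_rw_block_io() fills in as grants are mapped.
+ */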
55206 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
55207 +{
55208 +       int size;
55209 +       struct page **map;
55210 +       int i;
55211 +       tap_blkif_t *info = filp->private_data;
55212 +
55213 +       if (info == NULL) {
55214 +               WPRINTK("blktap: mmap, retrieving idx failed\n");
55215 +               return -ENOMEM;
55216 +       }
55217 +       
55218 +       vma->vm_flags |= VM_RESERVED;
55219 +       vma->vm_ops = &blktap_vm_ops;
55220 +
55221 +       size = vma->vm_end - vma->vm_start;
55222 +       if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
55223 +               WPRINTK("you _must_ map exactly %d pages!\n",
55224 +                      mmap_pages + RING_PAGES);
55225 +               return -EAGAIN;
55226 +       }
55227 +
55228 +       size >>= PAGE_SHIFT;
55229 +       info->rings_vstart = vma->vm_start;
55230 +       info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
55231 +    
55232 +       /* Map the ring pages to the start of the region and reserve it. */
55233 +       if (remap_pfn_range(vma, vma->vm_start, 
55234 +                           __pa(info->ufe_ring.sring) >> PAGE_SHIFT, 
55235 +                           PAGE_SIZE, vma->vm_page_prot)) {
55236 +               WPRINTK("Mapping user ring failed!\n");
55237 +               goto fail;
55238 +       }
55239 +
55240 +       /* Mark this VM as containing foreign pages, and set up mappings. */
55241 +       map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
55242 +                     * sizeof(struct page *),
55243 +                     GFP_KERNEL);
55244 +       if (map == NULL) {
55245 +               WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
55246 +               goto fail;
55247 +       }
55248 +
55249 +       for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
55250 +               map[i] = NULL;
55251 +    
55252 +       vma->vm_private_data = map;
55253 +       vma->vm_flags |= VM_FOREIGN;
55254 +
55255 +       info->vma = vma;
55256 +       info->ring_ok = 1;
55257 +       return 0;
55258 + fail:
55259 +       /* Clear any active mappings. */
55260 +       zap_page_range(vma, vma->vm_start, 
55261 +                      vma->vm_end - vma->vm_start, NULL);
55262 +
55263 +       return -ENOMEM;
55264 +}
55265 +
55266 +
55267 +static int blktap_ioctl(struct inode *inode, struct file *filp,
55268 +                        unsigned int cmd, unsigned long arg)
55269 +{
55270 +       tap_blkif_t *info = filp->private_data;
55271 +
55272 +       switch(cmd) {
55273 +       case BLKTAP_IOCTL_KICK_FE: 
55274 +       {
55275 +               /* There are fe messages to process. */
55276 +               return blktap_read_ufe_ring(info);
55277 +       }
55278 +       case BLKTAP_IOCTL_SETMODE:
55279 +       {
55280 +               if (info) {
55281 +                       if (BLKTAP_MODE_VALID(arg)) {
55282 +                               info->mode = arg;
55283 +                               /* XXX: may need to flush rings here. */
55284 +                               DPRINTK("blktap: set mode to %lx\n", 
55285 +                                      arg);
55286 +                               return 0;
55287 +                       }
55288 +               }
55289 +               return 0;
55290 +       }
55291 +       case BLKTAP_IOCTL_PRINT_IDXS:
55292 +        {
55293 +               if (info) {
55294 +                       printk("User Rings: \n-----------\n");
55295 +                       printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
55296 +                               "| req_prod: %2d, rsp_prod: %2d\n",
55297 +                               info->ufe_ring.rsp_cons,
55298 +                               info->ufe_ring.req_prod_pvt,
55299 +                               info->ufe_ring.sring->req_prod,
55300 +                               info->ufe_ring.sring->rsp_prod);
55301 +               }
55302 +               return 0;
55303 +        }
55304 +       case BLKTAP_IOCTL_SENDPID:
55305 +       {
55306 +               if (info) {
55307 +                       info->pid = (pid_t)arg;
55308 +                       DPRINTK("blktap: pid received %d\n", 
55309 +                              info->pid);
55310 +               }
55311 +               return 0;
55312 +       }
55313 +       case BLKTAP_IOCTL_NEWINTF:
55314 +       {               
55315 +               uint64_t val = (uint64_t)arg;
55316 +               domid_translate_t *tr = (domid_translate_t *)&val;
55317 +
55318 +               DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
55319 +                      tr->domid, tr->busid);
55320 +               info = get_next_free_dev();
55321 +               if (!info) {
55322 +                       WPRINTK("Error initialising /dev/xen/blktap - "
55323 +                               "No more devices\n");
55324 +                       return -1;
55325 +               }
55326 +               info->trans.domid = tr->domid;
55327 +               info->trans.busid = tr->busid;
55328 +               return info->minor;
55329 +       }
55330 +       case BLKTAP_IOCTL_FREEINTF:
55331 +       {
55332 +               unsigned long dev = arg;
55333 +               unsigned long flags;
55334 +
55335 +               info = tapfds[dev];
55336 +
55337 +               if ((dev > MAX_TAP_DEV) || !info)
55338 +                       return 0; /* should this be an error? */
55339 +
55340 +               spin_lock_irqsave(&pending_free_lock, flags);
55341 +               if (info->dev_pending)
55342 +                       info->dev_pending = 0;
55343 +               spin_unlock_irqrestore(&pending_free_lock, flags);
55344 +
55345 +               return 0;
55346 +       }
55347 +       case BLKTAP_IOCTL_MINOR:
55348 +       {
55349 +               unsigned long dev = arg;
55350 +
55351 +               info = tapfds[dev];
55352 +
55353 +               if ((dev > MAX_TAP_DEV) || !info)
55354 +                       return -EINVAL;
55355 +
55356 +               return info->minor;
55357 +       }
55358 +       case BLKTAP_IOCTL_MAJOR:
55359 +               return blktap_major;
55360 +
55361 +       case BLKTAP_QUERY_ALLOC_REQS:
55362 +       {
55363 +               WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
55364 +                      alloc_pending_reqs, blkif_reqs);
55365 +               return (alloc_pending_reqs/blkif_reqs) * 100;
55366 +       }
55367 +       }
55368 +       return -ENOIOCTLCMD;
55369 +}
55370 +
55371 +static unsigned int blktap_poll(struct file *filp, poll_table *wait)
55372 +{
55373 +       tap_blkif_t *info = filp->private_data;
55374 +       
55375 +       /* do not work on the control device */
55376 +       if (!info)
55377 +               return 0;
55378 +
55379 +       poll_wait(filp, &info->wait, wait);
55380 +       if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
55381 +               RING_PUSH_REQUESTS(&info->ufe_ring);
55382 +               return POLLIN | POLLRDNORM;
55383 +       }
55384 +       return 0;
55385 +}
55386 +
55387 +void blktap_kick_user(int idx)
55388 +{
55389 +       tap_blkif_t *info;
55390 +
55391 +       info = tapfds[idx];
55392 +
55393 +       if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
55394 +               return;
55395 +
55396 +       wake_up_interruptible(&info->wait);
55397 +
55398 +       return;
55399 +}
55400 +
55401 +static int do_block_io_op(blkif_t *blkif);
55402 +static void dispatch_rw_block_io(blkif_t *blkif,
55403 +                                blkif_request_t *req,
55404 +                                pending_req_t *pending_req);
55405 +static void make_response(blkif_t *blkif, unsigned long id, 
55406 +                          unsigned short op, int st);
55407 +
55408 +/******************************************************************
55409 + * misc small helpers
55410 + */
55411 +static int req_increase(void)
55412 +{
55413 +       int i, j;
55414 +
55415 +       if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
55416 +               return -EINVAL;
55417 +
55418 +       pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
55419 +                                           * blkif_reqs, GFP_KERNEL);
55420 +       foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
55421 +
55422 +       if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
55423 +               goto out_of_memory;
55424 +
55425 +       DPRINTK("%s: reqs=%d, pages=%d\n",
55426 +               __FUNCTION__, blkif_reqs, mmap_pages);
55427 +
55428 +       for (i = 0; i < MAX_PENDING_REQS; i++) {
55429 +               list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
55430 +                             &pending_free);
55431 +               pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
55432 +               for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
55433 +                       BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
55434 +                                                                i, j));
55435 +       }
55436 +
55437 +       mmap_alloc++;
55438 +       DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
55439 +       return 0;
55440 +
55441 + out_of_memory:
55442 +       free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
55443 +       kfree(pending_reqs[mmap_alloc]);
55444 +       WPRINTK("%s: out of memory\n", __FUNCTION__);
55445 +       return -ENOMEM;
55446 +}
55447 +
55448 +static void mmap_req_del(int mmap)
55449 +{
55450 +       BUG_ON(!spin_is_locked(&pending_free_lock));
55451 +
55452 +       kfree(pending_reqs[mmap]);
55453 +       pending_reqs[mmap] = NULL;
55454 +
55455 +       free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
55456 +       foreign_pages[mmap] = NULL;
55457 +
55458 +       mmap_lock = 0;
55459 +       DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
55460 +       mmap_alloc--;
55461 +}
55462 +
55463 +static pending_req_t* alloc_req(void)
55464 +{
55465 +       pending_req_t *req = NULL;
55466 +       unsigned long flags;
55467 +
55468 +       spin_lock_irqsave(&pending_free_lock, flags);
55469 +
55470 +       if (!list_empty(&pending_free)) {
55471 +               req = list_entry(pending_free.next, pending_req_t, free_list);
55472 +               list_del(&req->free_list);
55473 +       }
55474 +
55475 +       if (req) {
55476 +               req->inuse = 1;
55477 +               alloc_pending_reqs++;
55478 +       }
55479 +       spin_unlock_irqrestore(&pending_free_lock, flags);
55480 +
55481 +       return req;
55482 +}
55483 +
55484 +static void free_req(pending_req_t *req)
55485 +{
55486 +       unsigned long flags;
55487 +       int was_empty;
55488 +
55489 +       spin_lock_irqsave(&pending_free_lock, flags);
55490 +
55491 +       alloc_pending_reqs--;
55492 +       req->inuse = 0;
55493 +       if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
55494 +               mmap_inuse--;
55495 +               if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
55496 +               spin_unlock_irqrestore(&pending_free_lock, flags);
55497 +               return;
55498 +       }
55499 +       was_empty = list_empty(&pending_free);
55500 +       list_add(&req->free_list, &pending_free);
55501 +
55502 +       spin_unlock_irqrestore(&pending_free_lock, flags);
55503 +
55504 +       if (was_empty)
55505 +               wake_up(&pending_free_wq);
55506 +}
55507 +
55508 +static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
55509 +                           int tapidx)
55510 +{
55511 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
55512 +       unsigned int i, invcount = 0;
55513 +       struct grant_handle_pair *khandle;
55514 +       uint64_t ptep;
55515 +       int ret, mmap_idx;
55516 +       unsigned long kvaddr, uvaddr;
55517 +       tap_blkif_t *info;
55518 +       
55519 +
55520 +       info = tapfds[tapidx];
55521 +
55522 +       if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
55523 +               WPRINTK("fast_flush: Couldn't get info!\n");
55524 +               return;
55525 +       }
55526 +
55527 +       if (info->vma != NULL &&
55528 +           xen_feature(XENFEAT_auto_translated_physmap)) {
55529 +               down_write(&info->vma->vm_mm->mmap_sem);
55530 +               zap_page_range(info->vma, 
55531 +                              MMAP_VADDR(info->user_vstart, u_idx, 0), 
55532 +                              req->nr_pages << PAGE_SHIFT, NULL);
55533 +               up_write(&info->vma->vm_mm->mmap_sem);
55534 +       }
55535 +
55536 +       mmap_idx = req->mem_idx;
55537 +
55538 +       for (i = 0; i < req->nr_pages; i++) {
55539 +               kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
55540 +               uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
55541 +
55542 +               khandle = &pending_handle(mmap_idx, k_idx, i);
55543 +
55544 +               if (khandle->kernel != INVALID_GRANT_HANDLE) {
55545 +                       gnttab_set_unmap_op(&unmap[invcount],
55546 +                                           idx_to_kaddr(mmap_idx, k_idx, i),
55547 +                                           GNTMAP_host_map, khandle->kernel);
55548 +                       invcount++;
55549 +               }
55550 +
55551 +               if (khandle->user != INVALID_GRANT_HANDLE) {
55552 +                       BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
55553 +                       if (create_lookup_pte_addr(
55554 +                               info->vma->vm_mm,
55555 +                               MMAP_VADDR(info->user_vstart, u_idx, i),
55556 +                               &ptep) !=0) {
55557 +                               WPRINTK("Couldn't get a pte addr!\n");
55558 +                               return;
55559 +                       }
55560 +
55561 +                       gnttab_set_unmap_op(&unmap[invcount], ptep,
55562 +                                           GNTMAP_host_map
55563 +                                           | GNTMAP_application_map
55564 +                                           | GNTMAP_contains_pte,
55565 +                                           khandle->user);
55566 +                       invcount++;
55567 +               }
55568 +
55569 +               BLKTAP_INVALIDATE_HANDLE(khandle);
55570 +       }
55571 +       ret = HYPERVISOR_grant_table_op(
55572 +               GNTTABOP_unmap_grant_ref, unmap, invcount);
55573 +       BUG_ON(ret);
55574 +       
55575 +       if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
55576 +               zap_page_range(info->vma, 
55577 +                              MMAP_VADDR(info->user_vstart, u_idx, 0), 
55578 +                              req->nr_pages << PAGE_SHIFT, NULL);
55579 +}
55580 +
55581 +/******************************************************************
55582 + * SCHEDULER FUNCTIONS
55583 + */
55584 +
55585 +static void print_stats(blkif_t *blkif)
55586 +{
55587 +       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
55588 +              current->comm, blkif->st_oo_req,
55589 +              blkif->st_rd_req, blkif->st_wr_req);
55590 +       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
55591 +       blkif->st_rd_req = 0;
55592 +       blkif->st_wr_req = 0;
55593 +       blkif->st_oo_req = 0;
55594 +}
55595 +
55596 +int tap_blkif_schedule(void *arg)
55597 +{
55598 +       blkif_t *blkif = arg;
55599 +
55600 +       blkif_get(blkif);
55601 +
55602 +       if (debug_lvl)
55603 +               printk(KERN_DEBUG "%s: started\n", current->comm);
55604 +
55605 +       while (!kthread_should_stop()) {
55606 +               wait_event_interruptible(
55607 +                       blkif->wq,
55608 +                       blkif->waiting_reqs || kthread_should_stop());
55609 +               wait_event_interruptible(
55610 +                       pending_free_wq,
55611 +                       !list_empty(&pending_free) || kthread_should_stop());
55612 +
55613 +               blkif->waiting_reqs = 0;
55614 +               smp_mb(); /* clear flag *before* checking for work */
55615 +
55616 +               if (do_block_io_op(blkif))
55617 +                       blkif->waiting_reqs = 1;
55618 +
55619 +               if (log_stats && time_after(jiffies, blkif->st_print))
55620 +                       print_stats(blkif);
55621 +       }
55622 +
55623 +       if (log_stats)
55624 +               print_stats(blkif);
55625 +       if (debug_lvl)
55626 +               printk(KERN_DEBUG "%s: exiting\n", current->comm);
55627 +
55628 +       blkif->xenblkd = NULL;
55629 +       blkif_put(blkif);
55630 +
55631 +       return 0;
55632 +}
55633 +
55634 +/******************************************************************
55635 + * COMPLETION CALLBACK -- Called by user level ioctl()
55636 + */
55637 +
55638 +static int blktap_read_ufe_ring(tap_blkif_t *info)
55639 +{
55640 +       /* This is called to read responses from the UFE ring. */
55641 +       RING_IDX i, j, rp;
55642 +       blkif_response_t *resp;
55643 +       blkif_t *blkif=NULL;
55644 +       int pending_idx, usr_idx, mmap_idx;
55645 +       pending_req_t *pending_req;
55646 +       
55647 +       if (!info)
55648 +               return 0;
55649 +
55650 +       /* We currently only forward packets in INTERCEPT_FE mode. */
55651 +       if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
55652 +               return 0;
55653 +
55654 +       /* for each outstanding message on the UFE ring */
55655 +       rp = info->ufe_ring.sring->rsp_prod;
55656 +       rmb();
55657 +        
55658 +       for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
55659 +               blkif_response_t res;
55660 +               resp = RING_GET_RESPONSE(&info->ufe_ring, i);
55661 +               memcpy(&res, resp, sizeof(res));
55662 +               mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
55663 +               ++info->ufe_ring.rsp_cons;
55664 +
55665 +               /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
55666 +               usr_idx = (int)res.id;
55667 +               pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
55668 +               mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
55669 +
55670 +               if ( (mmap_idx >= mmap_alloc) || 
55671 +                  (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
55672 +                       WPRINTK("Incorrect req map"
55673 +                              "[%d], internal map [%d,%d (%d)]\n", 
55674 +                              usr_idx, mmap_idx, 
55675 +                              ID_TO_IDX(info->idx_map[usr_idx]),
55676 +                              MASK_PEND_IDX(
55677 +                                      ID_TO_IDX(info->idx_map[usr_idx])));
55678 +
55679 +               pending_req = &pending_reqs[mmap_idx][pending_idx];
55680 +               blkif = pending_req->blkif;
55681 +
55682 +               for (j = 0; j < pending_req->nr_pages; j++) {
55683 +
55684 +                       unsigned long kvaddr, uvaddr;
55685 +                       struct page **map = info->vma->vm_private_data;
55686 +                       struct page *pg;
55687 +                       int offset;
55688 +
55689 +                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
55690 +                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
55691 +
55692 +                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
55693 +                       ClearPageReserved(pg);
55694 +                       offset = (uvaddr - info->vma->vm_start) 
55695 +                               >> PAGE_SHIFT;
55696 +                       map[offset] = NULL;
55697 +               }
55698 +               fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
55699 +               make_response(blkif, pending_req->id, res.operation,
55700 +                             res.status);
55701 +               info->idx_map[usr_idx] = INVALID_REQ;
55702 +               blkif_put(pending_req->blkif);
55703 +               free_req(pending_req);
55704 +       }
55705 +               
55706 +       return 0;
55707 +}
55708 +
55709 +
55710 +/******************************************************************************
55711 + * NOTIFICATION FROM GUEST OS.
55712 + */
55713 +
55714 +static void blkif_notify_work(blkif_t *blkif)
55715 +{
55716 +       blkif->waiting_reqs = 1;
55717 +       wake_up(&blkif->wq);
55718 +}
55719 +
55720 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
55721 +{
55722 +       blkif_notify_work(dev_id);
55723 +       return IRQ_HANDLED;
55724 +}
55725 +
55726 +
55727 +
55728 +/******************************************************************
55729 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
55730 + */
55731 +static int print_dbug = 1;
55732 +static int do_block_io_op(blkif_t *blkif)
55733 +{
55734 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
55735 +       blkif_request_t req;
55736 +       pending_req_t *pending_req;
55737 +       RING_IDX rc, rp;
55738 +       int more_to_do = 0;
55739 +       tap_blkif_t *info;
55740 +
55741 +       rc = blk_ring->req_cons;
55742 +       rp = blk_ring->sring->req_prod;
55743 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
55744 +
55745 +       /*Check blkif has corresponding UE ring*/
55746 +       if (blkif->dev_num < 0) {
55747 +               /*oops*/
55748 +               if (print_dbug) {
55749 +                       WPRINTK("Corresponding UE " 
55750 +                              "ring does not exist!\n");
55751 +                       print_dbug = 0; /*We only print this message once*/
55752 +               }
55753 +               return 0;
55754 +       }
55755 +
55756 +       info = tapfds[blkif->dev_num];
55757 +
55758 +       if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
55759 +               if (print_dbug) {
55760 +                       WPRINTK("Can't get UE info!\n");
55761 +                       print_dbug = 0;
55762 +               }
55763 +               return 0;
55764 +       }
55765 +
55766 +       while (rc != rp) {
55767 +               
55768 +               if (RING_FULL(&info->ufe_ring)) {
55769 +                       WPRINTK("RING_FULL! More to do\n");
55770 +                       more_to_do = 1;
55771 +                       break;
55772 +               }
55773 +               
55774 +               if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
55775 +                       WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
55776 +                              " More to do\n");
55777 +                       more_to_do = 1;
55778 +                       break;          
55779 +               }
55780 +
55781 +               pending_req = alloc_req();
55782 +               if (NULL == pending_req) {
55783 +                       blkif->st_oo_req++;
55784 +                       more_to_do = 1;
55785 +                       break;
55786 +               }
55787 +
55788 +               memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
55789 +               blk_ring->req_cons = ++rc; /* before make_response() */ 
55790 +
55791 +               switch (req.operation) {
55792 +               case BLKIF_OP_READ:
55793 +                       blkif->st_rd_req++;
55794 +                       dispatch_rw_block_io(blkif, &req, pending_req);
55795 +                       break;
55796 +
55797 +               case BLKIF_OP_WRITE:
55798 +                       blkif->st_wr_req++;
55799 +                       dispatch_rw_block_io(blkif, &req, pending_req);
55800 +                       break;
55801 +
55802 +               default:
55803 +                       WPRINTK("unknown operation [%d]\n",
55804 +                               req.operation);
55805 +                       make_response(blkif, req.id, req.operation,
55806 +                                     BLKIF_RSP_ERROR);
55807 +                       free_req(pending_req);
55808 +                       break;
55809 +               }
55810 +       }
55811 +               
55812 +       blktap_kick_user(blkif->dev_num);
55813 +
55814 +       return more_to_do;
55815 +}
55816 +
55817 +static void dispatch_rw_block_io(blkif_t *blkif,
55818 +                                blkif_request_t *req,
55819 +                                pending_req_t *pending_req)
55820 +{
55821 +       extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
55822 +       int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
55823 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
55824 +       unsigned int nseg;
55825 +       int ret, i;
55826 +       tap_blkif_t *info;
55827 +       uint64_t sector;
55828 +       blkif_request_t *target;
55829 +       int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
55830 +       int usr_idx;
55831 +       uint16_t mmap_idx = pending_req->mem_idx;
55832 +
55833 +       if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
55834 +               goto fail_response;
55835 +
55836 +       info = tapfds[blkif->dev_num];
55837 +       if (info == NULL)
55838 +               goto fail_response;
55839 +
55840 +       /* Check we have space on user ring - should never fail. */
55841 +       usr_idx = GET_NEXT_REQ(info->idx_map);
55842 +       if (usr_idx == INVALID_REQ) {
55843 +               BUG();
55844 +               goto fail_response;
55845 +       }
55846 +
55847 +       /* Check that number of segments is sane. */
55848 +       nseg = req->nr_segments;
55849 +       if ( unlikely(nseg == 0) || 
55850 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
55851 +               WPRINTK("Bad number of segments in request (%d)\n", nseg);
55852 +               goto fail_response;
55853 +       }
55854 +       
55855 +       /* Make sure userspace is ready. */
55856 +       if (!info->ring_ok) {
55857 +               WPRINTK("blktap: ring not ready for requests!\n");
55858 +               goto fail_response;
55859 +       }
55860 +
55861 +       if (RING_FULL(&info->ufe_ring)) {
55862 +               WPRINTK("blktap: fe_ring is full, can't add request; "
55863 +                       "IO request will be dropped. %d %d\n",
55864 +                       RING_SIZE(&info->ufe_ring),
55865 +                       RING_SIZE(&blkif->blk_ring));
55866 +               goto fail_response;
55867 +       }
55868 +
55869 +       pending_req->blkif     = blkif;
55870 +       pending_req->id        = req->id;
55871 +       pending_req->operation = operation;
55872 +       pending_req->status    = BLKIF_RSP_OKAY;
55873 +       pending_req->nr_pages  = nseg;
55874 +       op = 0;
55875 +       for (i = 0; i < nseg; i++) {
55876 +               unsigned long uvaddr;
55877 +               unsigned long kvaddr;
55878 +               uint64_t ptep;
55879 +               uint32_t flags;
55880 +
55881 +               uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
55882 +               kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
55883 +
55884 +               sector = req->sector_number + ((PAGE_SIZE / 512) * i);
55885 +               if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
55886 +                       WPRINTK("BLKTAP: Sector request greater "
55887 +                              "than size\n");
55888 +                       WPRINTK("BLKTAP: %s request sector "
55889 +                              "[%llu,%llu], Total [%llu]\n",
55890 +                              (req->operation == 
55891 +                               BLKIF_OP_WRITE ? "WRITE" : "READ"),
55892 +                               (long long unsigned) sector,
55893 +                               (long long unsigned) sector>>9,
55894 +                               (long long unsigned) blkif->sectors);
55895 +               }
55896 +
55897 +               flags = GNTMAP_host_map;
55898 +               if (operation == WRITE)
55899 +                       flags |= GNTMAP_readonly;
55900 +               gnttab_set_map_op(&map[op], kvaddr, flags,
55901 +                                 req->seg[i].gref, blkif->domid);
55902 +               op++;
55903 +
55904 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
55905 +                       /* Now map it to user. */
55906 +                       ret = create_lookup_pte_addr(info->vma->vm_mm, 
55907 +                                                    uvaddr, &ptep);
55908 +                       if (ret) {
55909 +                               WPRINTK("Couldn't get a pte addr!\n");
55910 +                               goto fail_flush;
55911 +                       }
55912 +
55913 +                       flags = GNTMAP_host_map | GNTMAP_application_map
55914 +                               | GNTMAP_contains_pte;
55915 +                       if (operation == WRITE)
55916 +                               flags |= GNTMAP_readonly;
55917 +                       gnttab_set_map_op(&map[op], ptep, flags,
55918 +                                         req->seg[i].gref, blkif->domid);
55919 +                       op++;
55920 +               }
55921 +       }
55922 +
55923 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
55924 +       BUG_ON(ret);
55925 +
55926 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
55927 +               for (i = 0; i < (nseg*2); i+=2) {
55928 +                       unsigned long uvaddr;
55929 +                       unsigned long kvaddr;
55930 +                       unsigned long offset;
55931 +                       struct page *pg;
55932 +
55933 +                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
55934 +                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
55935 +
55936 +                       if (unlikely(map[i].status != 0)) {
55937 +                               WPRINTK("invalid kernel buffer -- "
55938 +                                       "could not remap it\n");
55939 +                               ret |= 1;
55940 +                               map[i].handle = INVALID_GRANT_HANDLE;
55941 +                       }
55942 +
55943 +                       if (unlikely(map[i+1].status != 0)) {
55944 +                               WPRINTK("invalid user buffer -- "
55945 +                                       "could not remap it\n");
55946 +                               ret |= 1;
55947 +                               map[i+1].handle = INVALID_GRANT_HANDLE;
55948 +                       }
55949 +
55950 +                       pending_handle(mmap_idx, pending_idx, i/2).kernel 
55951 +                               = map[i].handle;
55952 +                       pending_handle(mmap_idx, pending_idx, i/2).user   
55953 +                               = map[i+1].handle;
55954 +
55955 +                       if (ret)
55956 +                               continue;
55957 +
55958 +                       set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
55959 +                                           FOREIGN_FRAME(map[i].dev_bus_addr
55960 +                                                         >> PAGE_SHIFT));
55961 +                       offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
55962 +                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
55963 +                       ((struct page **)info->vma->vm_private_data)[offset] =
55964 +                               pg;
55965 +               }
55966 +       } else {
55967 +               for (i = 0; i < nseg; i++) {
55968 +                       unsigned long uvaddr;
55969 +                       unsigned long kvaddr;
55970 +                       unsigned long offset;
55971 +                       struct page *pg;
55972 +
55973 +                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
55974 +                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
55975 +
55976 +                       if (unlikely(map[i].status != 0)) {
55977 +                               WPRINTK("invalid kernel buffer -- "
55978 +                                       "could not remap it\n");
55979 +                               ret |= 1;
55980 +                               map[i].handle = INVALID_GRANT_HANDLE;
55981 +                       }
55982 +
55983 +                       pending_handle(mmap_idx, pending_idx, i).kernel 
55984 +                               = map[i].handle;
55985 +
55986 +                       if (ret)
55987 +                               continue;
55988 +
55989 +                       offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
55990 +                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
55991 +                       ((struct page **)info->vma->vm_private_data)[offset] =
55992 +                               pg;
55993 +               }
55994 +       }
55995 +
55996 +       if (ret)
55997 +               goto fail_flush;
55998 +
55999 +       if (xen_feature(XENFEAT_auto_translated_physmap))
56000 +               down_write(&info->vma->vm_mm->mmap_sem);
56001 +       /* Mark mapped pages as reserved: */
56002 +       for (i = 0; i < req->nr_segments; i++) {
56003 +               unsigned long kvaddr;
56004 +               struct page *pg;
56005 +
56006 +               kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
56007 +               pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
56008 +               SetPageReserved(pg);
56009 +               if (xen_feature(XENFEAT_auto_translated_physmap)) {
56010 +                       ret = vm_insert_page(info->vma,
56011 +                                            MMAP_VADDR(info->user_vstart,
56012 +                                                       usr_idx, i), pg);
56013 +                       if (ret) {
56014 +                               up_write(&info->vma->vm_mm->mmap_sem);
56015 +                               goto fail_flush;
56016 +                       }
56017 +               }
56018 +       }
56019 +       if (xen_feature(XENFEAT_auto_translated_physmap))
56020 +               up_write(&info->vma->vm_mm->mmap_sem);
56021 +       
56022 +       /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
56023 +       info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
56024 +
56025 +       blkif_get(blkif);
56026 +       /* Finally, write the request message to the user ring. */
56027 +       target = RING_GET_REQUEST(&info->ufe_ring,
56028 +                                 info->ufe_ring.req_prod_pvt);
56029 +       memcpy(target, req, sizeof(*req));
56030 +       target->id = usr_idx;
56031 +       wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
56032 +       info->ufe_ring.req_prod_pvt++;
56033 +       return;
56034 +
56035 + fail_flush:
56036 +       WPRINTK("Reached Fail_flush\n");
56037 +       fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
56038 + fail_response:
56039 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
56040 +       free_req(pending_req);
56041 +} 
56042 +
56043 +
56044 +
56045 +/******************************************************************
56046 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
56047 + */
56048 +
56049 +
56050 +static void make_response(blkif_t *blkif, unsigned long id, 
56051 +                          unsigned short op, int st)
56052 +{
56053 +       blkif_response_t *resp;
56054 +       unsigned long     flags;
56055 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
56056 +       int more_to_do = 0;
56057 +       int notify;
56058 +
56059 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
56060 +       /* Place on the response ring for the relevant domain. */ 
56061 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
56062 +       resp->id        = id;
56063 +       resp->operation = op;
56064 +       resp->status    = st;
56065 +       blk_ring->rsp_prod_pvt++;
56066 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
56067 +
56068 +       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
56069 +               /*
56070 +                * Tail check for pending requests. Allows frontend to avoid
56071 +                * notifications if requests are already in flight (lower
56072 +                * overheads and promotes batching).
56073 +                */
56074 +               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
56075 +       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
56076 +               more_to_do = 1;
56077 +
56078 +       }       
56079 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
56080 +       if (more_to_do)
56081 +               blkif_notify_work(blkif);
56082 +       if (notify)
56083 +               notify_remote_via_irq(blkif->irq);
56084 +}
56085 +
56086 +static int __init blkif_init(void)
56087 +{
56088 +       int i,ret,blktap_dir;
56089 +
56090 +       if (!is_running_on_xen())
56091 +               return -ENODEV;
56092 +
56093 +       INIT_LIST_HEAD(&pending_free);
56094 +        for(i = 0; i < 2; i++) {
56095 +               ret = req_increase();
56096 +               if (ret)
56097 +                       break;
56098 +       }
56099 +       if (i == 0)
56100 +               return ret;
56101 +
56102 +       tap_blkif_interface_init();
56103 +
56104 +       alloc_pending_reqs = 0;
56105 +
56106 +       tap_blkif_xenbus_init();
56107 +
56108 +       /* Dynamically allocate a major for this device */
56109 +       ret = register_chrdev(0, "blktap", &blktap_fops);
56110 +
56111 +       if (ret < 0) {
56112 +               WPRINTK("Couldn't register /dev/xen/blktap\n");
56113 +               return -ENOMEM;
56114 +       }       
56115 +       
56116 +       blktap_major = ret;
56117 +
56118 +       /* tapfds[0] is always NULL */
56119 +       blktap_next_minor++;
56120 +
56121 +       DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
56122 +
56123 +       /* Make sure the xen class exists */
56124 +       if (!setup_xen_class()) {
56125 +               /*
56126 +                * This will allow udev to create the blktap ctrl device.
56127 +                * We only want to create blktap0 first.  We don't want
56128 +                * to flood the sysfs system with needless blktap devices.
56129 +                * We only create the device when a request for a new device is
56130 +                * made.
56131 +                */
56132 +               class_device_create(xen_class, NULL,
56133 +                                   MKDEV(blktap_major, 0), NULL,
56134 +                                   "blktap0");
56135 +       } else {
56136 +               /* this is bad, but not fatal */
56137 +               WPRINTK("blktap: sysfs xen_class not created\n");
56138 +       }
56139 +
56140 +       DPRINTK("Blktap device successfully created\n");
56141 +
56142 +       return 0;
56143 +}
56144 +
56145 +module_init(blkif_init);
56146 +
56147 +MODULE_LICENSE("Dual BSD/GPL");
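The dispatch code above records which backend slot backs each user-ring slot with info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx). MAKE_ID itself is defined earlier in blktap.c, outside this excerpt, so its exact bit layout is not shown here; the stand-alone sketch below only illustrates the general idea of packing two small indices into one id and recovering them, under the assumption of a hypothetical high-byte/low-byte split.

/* Illustrative sketch only: pack (mmap_idx, pending_idx) into one id and
 * unpack it again, assuming a hypothetical 8-bit split per field.
 * The real MAKE_ID macro lives elsewhere in blktap.c and may differ. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EX_MAKE_ID(mmap_idx, pending_idx) \
	((uint16_t)((((mmap_idx) & 0xff) << 8) | ((pending_idx) & 0xff)))
#define EX_ID_TO_MIDX(id)  (((id) >> 8) & 0xff)
#define EX_ID_TO_IDX(id)   ((id) & 0xff)

int main(void)
{
	uint16_t id = EX_MAKE_ID(3, 42);

	assert(EX_ID_TO_MIDX(id) == 3);
	assert(EX_ID_TO_IDX(id) == 42);
	printf("id=0x%04x -> mmap_idx=%u pending_idx=%u\n",
	       id, (unsigned)EX_ID_TO_MIDX(id), (unsigned)EX_ID_TO_IDX(id));
	return 0;
}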
56148 diff -ruNp linux-2.6.19/drivers/xen/blktap/common.h linux-2.6.19-xen-3.0.4/drivers/xen/blktap/common.h
56149 --- linux-2.6.19/drivers/xen/blktap/common.h    1970-01-01 00:00:00.000000000 +0000
56150 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blktap/common.h  2007-02-02 19:10:45.000000000 +0000
56151 @@ -0,0 +1,120 @@
56152 +/* 
56153 + * This program is free software; you can redistribute it and/or
56154 + * modify it under the terms of the GNU General Public License version 2
56155 + * as published by the Free Software Foundation; or, when distributed
56156 + * separately from the Linux kernel or incorporated into other
56157 + * software packages, subject to the following license:
56158 + * 
56159 + * Permission is hereby granted, free of charge, to any person obtaining a copy
56160 + * of this source file (the "Software"), to deal in the Software without
56161 + * restriction, including without limitation the rights to use, copy, modify,
56162 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
56163 + * and to permit persons to whom the Software is furnished to do so, subject to
56164 + * the following conditions:
56165 + * 
56166 + * The above copyright notice and this permission notice shall be included in
56167 + * all copies or substantial portions of the Software.
56168 + * 
56169 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
56170 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56171 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
56172 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
56173 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
56174 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
56175 + * IN THE SOFTWARE.
56176 + */
56177 +
56178 +#ifndef __BLKIF__BACKEND__COMMON_H__
56179 +#define __BLKIF__BACKEND__COMMON_H__
56180 +
56181 +#include <linux/version.h>
56182 +#include <linux/module.h>
56183 +#include <linux/interrupt.h>
56184 +#include <linux/slab.h>
56185 +#include <linux/blkdev.h>
56186 +#include <linux/vmalloc.h>
56187 +#include <asm/io.h>
56188 +#include <asm/setup.h>
56189 +#include <asm/pgalloc.h>
56190 +#include <xen/evtchn.h>
56191 +#include <asm/hypervisor.h>
56192 +#include <xen/interface/io/blkif.h>
56193 +#include <xen/interface/io/ring.h>
56194 +#include <xen/gnttab.h>
56195 +#include <xen/driver_util.h>
56196 +
56197 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
56198 +                                    __FILE__ , __LINE__ , ## _a )
56199 +
56200 +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
56201 +
56202 +struct backend_info;
56203 +
56204 +typedef struct blkif_st {
56205 +       /* Unique identifier for this interface. */
56206 +       domid_t           domid;
56207 +       unsigned int      handle;
56208 +       /* Physical parameters of the comms window. */
56209 +       unsigned int      evtchn;
56210 +       unsigned int      irq;
56211 +       /* Comms information. */
56212 +       blkif_back_ring_t blk_ring;
56213 +       struct vm_struct *blk_ring_area;
56214 +       /* Back pointer to the backend_info. */
56215 +       struct backend_info *be;
56216 +       /* Private fields. */
56217 +       spinlock_t       blk_ring_lock;
56218 +       atomic_t         refcnt;
56219 +
56220 +       wait_queue_head_t   wq;
56221 +       struct task_struct  *xenblkd;
56222 +       unsigned int        waiting_reqs;
56223 +       request_queue_t     *plug;
56224 +
56225 +       /* statistics */
56226 +       unsigned long       st_print;
56227 +       int                 st_rd_req;
56228 +       int                 st_wr_req;
56229 +       int                 st_oo_req;
56230 +
56231 +       wait_queue_head_t waiting_to_free;
56232 +
56233 +       grant_handle_t shmem_handle;
56234 +       grant_ref_t    shmem_ref;
56235 +       
56236 +       int             dev_num;
56237 +       uint64_t        sectors;
56238 +} blkif_t;
56239 +
56240 +blkif_t *tap_alloc_blkif(domid_t domid);
56241 +void tap_blkif_free(blkif_t *blkif);
56242 +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, 
56243 +                 unsigned int evtchn);
56244 +void tap_blkif_unmap(blkif_t *blkif);
56245 +
56246 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
56247 +#define blkif_put(_b)                                  \
56248 +       do {                                            \
56249 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
56250 +                       wake_up(&(_b)->waiting_to_free);\
56251 +       } while (0)
56252 +
56253 +
56254 +struct phys_req {
56255 +       unsigned short       dev;
56256 +       unsigned short       nr_sects;
56257 +       struct block_device *bdev;
56258 +       blkif_sector_t       sector_number;
56259 +};
56260 +
56261 +void tap_blkif_interface_init(void);
56262 +
56263 +void tap_blkif_xenbus_init(void);
56264 +
56265 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
56266 +int tap_blkif_schedule(void *arg);
56267 +
56268 +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
56269 +void signal_tapdisk(int idx);
56270 +
56271 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
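The blkif_get()/blkif_put() macros above pair an atomic reference count with the waiting_to_free wait queue: the final put wakes whoever is blocked in tap_blkif_free() (defined in interface.c below), which waits for the count to reach zero before tearing the interface down. The following is only a loose user-space analogy of that pattern, using pthreads in place of the kernel's atomic_t and wait_event()/wake_up().

/* Loose user-space analogy of the refcnt + waiting_to_free pattern;
 * the kernel code uses atomic_t and wait_event()/wake_up() instead. */
#include <pthread.h>
#include <stdio.h>

struct obj {
	int refcnt;                 /* protected by lock */
	pthread_mutex_t lock;
	pthread_cond_t  waiting_to_free;
};

static void obj_get(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	o->refcnt++;
	pthread_mutex_unlock(&o->lock);
}

static void obj_put(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	if (--o->refcnt == 0)
		pthread_cond_broadcast(&o->waiting_to_free);
	pthread_mutex_unlock(&o->lock);
}

static void obj_free(struct obj *o)
{
	obj_put(o);                 /* drop the initial reference */
	pthread_mutex_lock(&o->lock);
	while (o->refcnt != 0)      /* wait for in-flight users */
		pthread_cond_wait(&o->waiting_to_free, &o->lock);
	pthread_mutex_unlock(&o->lock);
	puts("all references gone, safe to free");
}

int main(void)
{
	struct obj o = { 1, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

	obj_get(&o);
	obj_put(&o);
	obj_free(&o);
	return 0;
}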
56272 diff -ruNp linux-2.6.19/drivers/xen/blktap/interface.c linux-2.6.19-xen-3.0.4/drivers/xen/blktap/interface.c
56273 --- linux-2.6.19/drivers/xen/blktap/interface.c 1970-01-01 00:00:00.000000000 +0000
56274 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blktap/interface.c       2007-02-02 19:10:45.000000000 +0000
56275 @@ -0,0 +1,164 @@
56276 +/******************************************************************************
56277 + * drivers/xen/blktap/interface.c
56278 + * 
56279 + * Block-device interface management.
56280 + * 
56281 + * Copyright (c) 2004, Keir Fraser
56282 + *
56283 + * This program is free software; you can redistribute it and/or
56284 + * modify it under the terms of the GNU General Public License version 2
56285 + * as published by the Free Software Foundation; or, when distributed
56286 + * separately from the Linux kernel or incorporated into other
56287 + * software packages, subject to the following license:
56288 + *
56289 + * Permission is hereby granted, free of charge, to any person obtaining a copy
56290 + * of this source file (the "Software"), to deal in the Software without
56291 + * restriction, including without limitation the rights to use, copy, modify,
56292 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
56293 + * and to permit persons to whom the Software is furnished to do so, subject to
56294 + * the following conditions:
56295 + *
56296 + * The above copyright notice and this permission notice shall be included in
56297 + * all copies or substantial portions of the Software.
56298 + *
56299 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
56300 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56301 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
56302 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
56303 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
56304 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
56305 + * IN THE SOFTWARE.
56306 +
56307 + */
56308 +
56309 +#include "common.h"
56310 +#include <xen/evtchn.h>
56311 +
56312 +static kmem_cache_t *blkif_cachep;
56313 +
56314 +blkif_t *tap_alloc_blkif(domid_t domid)
56315 +{
56316 +       blkif_t *blkif;
56317 +
56318 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
56319 +       if (!blkif)
56320 +               return ERR_PTR(-ENOMEM);
56321 +
56322 +       memset(blkif, 0, sizeof(*blkif));
56323 +       blkif->domid = domid;
56324 +       spin_lock_init(&blkif->blk_ring_lock);
56325 +       atomic_set(&blkif->refcnt, 1);
56326 +       init_waitqueue_head(&blkif->wq);
56327 +       blkif->st_print = jiffies;
56328 +       init_waitqueue_head(&blkif->waiting_to_free);
56329 +
56330 +       return blkif;
56331 +}
56332 +
56333 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
56334 +{
56335 +       struct gnttab_map_grant_ref op;
56336 +       int ret;
56337 +
56338 +       gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
56339 +                         GNTMAP_host_map, shared_page, blkif->domid);
56340 +
56341 +       lock_vm_area(blkif->blk_ring_area);
56342 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
56343 +       unlock_vm_area(blkif->blk_ring_area);
56344 +       BUG_ON(ret);
56345 +
56346 +       if (op.status) {
56347 +               DPRINTK("Grant table operation failure!\n");
56348 +               return op.status;
56349 +       }
56350 +
56351 +       blkif->shmem_ref = shared_page;
56352 +       blkif->shmem_handle = op.handle;
56353 +
56354 +       return 0;
56355 +}
56356 +
56357 +static void unmap_frontend_page(blkif_t *blkif)
56358 +{
56359 +       struct gnttab_unmap_grant_ref op;
56360 +       int ret;
56361 +
56362 +       gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
56363 +                           GNTMAP_host_map, blkif->shmem_handle);
56364 +
56365 +       lock_vm_area(blkif->blk_ring_area);
56366 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
56367 +       unlock_vm_area(blkif->blk_ring_area);
56368 +       BUG_ON(ret);
56369 +}
56370 +
56371 +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, 
56372 +                 unsigned int evtchn)
56373 +{
56374 +       blkif_sring_t *sring;
56375 +       int err;
56376 +       struct evtchn_bind_interdomain bind_interdomain;
56377 +
56378 +       /* Already connected through? */
56379 +       if (blkif->irq)
56380 +               return 0;
56381 +
56382 +       if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
56383 +               return -ENOMEM;
56384 +
56385 +       err = map_frontend_page(blkif, shared_page);
56386 +       if (err) {
56387 +               free_vm_area(blkif->blk_ring_area);
56388 +               return err;
56389 +       }
56390 +
56391 +       bind_interdomain.remote_dom  = blkif->domid;
56392 +       bind_interdomain.remote_port = evtchn;
56393 +
56394 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
56395 +                                         &bind_interdomain);
56396 +       if (err) {
56397 +               unmap_frontend_page(blkif);
56398 +               free_vm_area(blkif->blk_ring_area);
56399 +               return err;
56400 +       }
56401 +
56402 +       blkif->evtchn = bind_interdomain.local_port;
56403 +
56404 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
56405 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
56406 +
56407 +       blkif->irq = bind_evtchn_to_irqhandler(
56408 +               blkif->evtchn, tap_blkif_be_int, 0, "blkif-backend", blkif);
56409 +
56410 +       return 0;
56411 +}
56412 +
56413 +void tap_blkif_unmap(blkif_t *blkif)
56414 +{
56415 +       if (blkif->irq) {
56416 +               unbind_from_irqhandler(blkif->irq, blkif);
56417 +               blkif->irq = 0;
56418 +       }
56419 +       if (blkif->blk_ring.sring) {
56420 +               unmap_frontend_page(blkif);
56421 +               free_vm_area(blkif->blk_ring_area);
56422 +               blkif->blk_ring.sring = NULL;
56423 +       }
56424 +}
56425 +
56426 +void tap_blkif_free(blkif_t *blkif)
56427 +{
56428 +       atomic_dec(&blkif->refcnt);
56429 +       wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
56430 +
56431 +       tap_blkif_unmap(blkif);
56432 +       kmem_cache_free(blkif_cachep, blkif);
56433 +}
56434 +
56435 +void __init tap_blkif_interface_init(void)
56436 +{
56437 +       blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t), 
56438 +                                        0, 0, NULL, NULL);
56439 +}
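tap_blkif_map() above acquires its resources in a fixed order (allocate the VM area, map the frontend's shared page through the grant table, bind the interdomain event channel) and, whenever a later step fails, releases the earlier ones before returning. The sketch below shows only that unwind-on-failure shape with stand-in step functions; it does not call any Xen interfaces.

/* Generic sketch of the unwind-on-failure sequencing used by
 * tap_blkif_map() (VM area -> grant map -> event channel); the
 * step_* functions here are stand-ins, not Xen calls. */
#include <stdio.h>
#include <stdlib.h>

static void *step_alloc_area(void)        { return malloc(16); }
static int   step_map_ring(void *area)    { (void)area; return 0;  /* 0 = ok */ }
static int   step_bind_evtchn(void *area) { (void)area; return -1; /* simulate failure */ }

static int example_connect(void)
{
	void *area = step_alloc_area();

	if (!area)
		return -1;

	if (step_map_ring(area) != 0) {
		free(area);
		return -1;
	}

	if (step_bind_evtchn(area) != 0) {
		/* undo the earlier steps in reverse order;
		 * tap_blkif_map() also unmaps the frontend page here */
		free(area);
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("example_connect() -> %d\n", example_connect());
	return 0;
}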
56440 diff -ruNp linux-2.6.19/drivers/xen/blktap/xenbus.c linux-2.6.19-xen-3.0.4/drivers/xen/blktap/xenbus.c
56441 --- linux-2.6.19/drivers/xen/blktap/xenbus.c    1970-01-01 00:00:00.000000000 +0000
56442 +++ linux-2.6.19-xen-3.0.4/drivers/xen/blktap/xenbus.c  2007-02-02 19:10:45.000000000 +0000
56443 @@ -0,0 +1,366 @@
56444 +/* drivers/xen/blktap/xenbus.c
56445 + *
56446 + * Xenbus code for blktap
56447 + *
56448 + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
56449 + *
56450 + * Based on the blkback xenbus code:
56451 + *
56452 + * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
56453 + * Copyright (C) 2005 XenSource Ltd
56454 + *
56455 + * This program is free software; you can redistribute it and/or
56456 + * modify it under the terms of the GNU General Public License version 2
56457 + * as published by the Free Software Foundation; or, when distributed
56458 + * separately from the Linux kernel or incorporated into other
56459 + * software packages, subject to the following license:
56460 + *
56461 + * Permission is hereby granted, free of charge, to any person obtaining a copy
56462 + * of this source file (the "Software"), to deal in the Software without
56463 + * restriction, including without limitation the rights to use, copy, modify,
56464 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
56465 + * and to permit persons to whom the Software is furnished to do so, subject to
56466 + * the following conditions:
56467 + *
56468 + * The above copyright notice and this permission notice shall be included in
56469 + * all copies or substantial portions of the Software.
56470 + *
56471 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
56472 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56473 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
56474 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
56475 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
56476 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
56477 + * IN THE SOFTWARE.
56478 + */
56479 +
56480 +#include <stdarg.h>
56481 +#include <linux/module.h>
56482 +#include <linux/kthread.h>
56483 +#include <xen/xenbus.h>
56484 +#include "common.h"
56485 +
56486 +
56487 +struct backend_info
56488 +{
56489 +       struct xenbus_device *dev;
56490 +       blkif_t *blkif;
56491 +       struct xenbus_watch backend_watch;
56492 +       int xenbus_id;
56493 +};
56494 +
56495 +
56496 +static void connect(struct backend_info *);
56497 +static int connect_ring(struct backend_info *);
56498 +static int blktap_remove(struct xenbus_device *dev);
56499 +static int blktap_probe(struct xenbus_device *dev,
56500 +                        const struct xenbus_device_id *id);
56501 +static void tap_backend_changed(struct xenbus_watch *, const char **,
56502 +                           unsigned int);
56503 +static void tap_frontend_changed(struct xenbus_device *dev,
56504 +                            enum xenbus_state frontend_state);
56505 +
56506 +static int strsep_len(const char *str, char c, unsigned int len)
56507 +{
56508 +        unsigned int i;
56509 +
56510 +        for (i = 0; str[i]; i++)
56511 +                if (str[i] == c) {
56512 +                        if (len == 0)
56513 +                                return i;
56514 +                        len--;
56515 +                }
56516 +        return (len == 0) ? i : -ERANGE;
56517 +}
56518 +
56519 +static long get_id(const char *str)
56520 +{
56521 +        int len,end;
56522 +        const char *ptr;
56523 +        char *tptr, num[10];
56524 +       
56525 +        len = strsep_len(str, '/', 2);
56526 +        end = strlen(str);
56527 +        if ( (len < 0) || (end < 0) ) return -1;
56528 +       
56529 +        ptr = str + len + 1;
56530 +        strncpy(num,ptr,end - len);
56531 +        tptr = num + (end - (len + 1));
56532 +        *tptr = '\0';
56533 +       DPRINTK("Get_id called for %s (%s)\n",str,num);
56534 +       
56535 +        return simple_strtol(num, NULL, 10);
56536 +}                              
56537 +
56538 +static void tap_update_blkif_status(blkif_t *blkif)
56539 +{ 
56540 +       int err;
56541 +
56542 +       /* Not ready to connect? */
56543 +       if(!blkif->irq || !blkif->sectors) {
56544 +               return;
56545 +       } 
56546 +
56547 +       /* Already connected? */
56548 +       if (blkif->be->dev->state == XenbusStateConnected)
56549 +               return;
56550 +
56551 +       /* Attempt to connect: exit if we fail to. */
56552 +       connect(blkif->be);
56553 +       if (blkif->be->dev->state != XenbusStateConnected)
56554 +               return;
56555 +
56556 +       blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif,
56557 +                                    "xvd %d",
56558 +                                    blkif->domid);
56559 +
56560 +       if (IS_ERR(blkif->xenblkd)) {
56561 +               err = PTR_ERR(blkif->xenblkd);
56562 +               blkif->xenblkd = NULL;
56563 +               xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd");
56564 +               WPRINTK("Error starting thread\n");
56565 +       }
56566 +}
56567 +
56568 +static int blktap_remove(struct xenbus_device *dev)
56569 +{
56570 +       struct backend_info *be = dev->dev.driver_data;
56571 +
56572 +       if (be->backend_watch.node) {
56573 +               unregister_xenbus_watch(&be->backend_watch);
56574 +               kfree(be->backend_watch.node);
56575 +               be->backend_watch.node = NULL;
56576 +       }
56577 +       if (be->blkif) {
56578 +               if (be->blkif->xenblkd)
56579 +                       kthread_stop(be->blkif->xenblkd);
56580 +               signal_tapdisk(be->blkif->dev_num);
56581 +               tap_blkif_free(be->blkif);
56582 +               be->blkif = NULL;
56583 +       }
56584 +       kfree(be);
56585 +       dev->dev.driver_data = NULL;
56586 +       return 0;
56587 +}
56588 +
56589 +/**
56590 + * Entry point to this code when a new device is created.  Allocate
56591 + * the basic structures, and watch the store waiting for the
56592 + * user-space program to tell us the physical device info.  Switch to
56593 + * InitWait.
56594 + */
56595 +static int blktap_probe(struct xenbus_device *dev,
56596 +                        const struct xenbus_device_id *id)
56597 +{
56598 +       int err;
56599 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
56600 +                                         GFP_KERNEL);
56601 +       if (!be) {
56602 +               xenbus_dev_fatal(dev, -ENOMEM,
56603 +                                "allocating backend structure");
56604 +               return -ENOMEM;
56605 +       }
56606 +
56607 +       be->dev = dev;
56608 +       dev->dev.driver_data = be;
56609 +       be->xenbus_id = get_id(dev->nodename);
56610 +
56611 +       be->blkif = tap_alloc_blkif(dev->otherend_id);
56612 +       if (IS_ERR(be->blkif)) {
56613 +               err = PTR_ERR(be->blkif);
56614 +               be->blkif = NULL;
56615 +               xenbus_dev_fatal(dev, err, "creating block interface");
56616 +               goto fail;
56617 +       }
56618 +
56619 +       /* setup back pointer */
56620 +       be->blkif->be = be;
56621 +       be->blkif->sectors = 0;
56622 +
56623 +       /* set a watch on disk info, waiting for userspace to update details*/
56624 +       err = xenbus_watch_path2(dev, dev->nodename, "info",
56625 +                                &be->backend_watch, tap_backend_changed);
56626 +       if (err)
56627 +               goto fail;
56628 +       
56629 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
56630 +       if (err)
56631 +               goto fail;
56632 +       return 0;
56633 +
56634 +fail:
56635 +       DPRINTK("blktap probe failed\n");
56636 +       blktap_remove(dev);
56637 +       return err;
56638 +}
56639 +
56640 +
56641 +/**
56642 + * Callback received when the user space code has placed the device
56643 + * information in xenstore. 
56644 + */
56645 +static void tap_backend_changed(struct xenbus_watch *watch,
56646 +                           const char **vec, unsigned int len)
56647 +{
56648 +       int err;
56649 +       unsigned long info;
56650 +       struct backend_info *be
56651 +               = container_of(watch, struct backend_info, backend_watch);
56652 +       struct xenbus_device *dev = be->dev;
56653 +       
56654 +       /*
56655 +        * Check to see whether userspace code has opened the image
56656 +        * and written sector and disk info
56657 +        * to xenstore.
56658 +        */
56659 +       err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info, 
56660 +                           NULL);      
56661 +       if (err) {
56662 +               xenbus_dev_error(dev, err, "getting info");
56663 +               return;
56664 +       }
56665 +
56666 +       DPRINTK("Userspace update on disk info, %lu\n",info);
56667 +
56668 +       err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu", 
56669 +                           &be->blkif->sectors, NULL);
56670 +
56671 +       /* Associate tap dev with domid*/
56672 +       be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id, 
56673 +                                         be->blkif);
56674 +       DPRINTK("Thread started for domid [%d], connecting disk\n", 
56675 +               be->blkif->dev_num);
56676 +
56677 +       tap_update_blkif_status(be->blkif);
56678 +}
56679 +
56680 +/**
56681 + * Callback received when the frontend's state changes.
56682 + */
56683 +static void tap_frontend_changed(struct xenbus_device *dev,
56684 +                            enum xenbus_state frontend_state)
56685 +{
56686 +       struct backend_info *be = dev->dev.driver_data;
56687 +       int err;
56688 +
56689 +       DPRINTK("\n");
56690 +
56691 +       switch (frontend_state) {
56692 +       case XenbusStateInitialising:
56693 +               if (dev->state == XenbusStateClosed) {
56694 +                       printk("%s: %s: prepare for reconnect\n",
56695 +                              __FUNCTION__, dev->nodename);
56696 +                       xenbus_switch_state(dev, XenbusStateInitWait);
56697 +               }
56698 +               break;
56699 +
56700 +       case XenbusStateInitialised:
56701 +       case XenbusStateConnected:
56702 +               /* Ensure we connect even when two watches fire in 
56703 +                  close succession and we miss the intermediate value 
56704 +                  of frontend_state. */
56705 +               if (dev->state == XenbusStateConnected)
56706 +                       break;
56707 +
56708 +               err = connect_ring(be);
56709 +               if (err)
56710 +                       break;
56711 +               tap_update_blkif_status(be->blkif);
56712 +               break;
56713 +
56714 +       case XenbusStateClosing:
56715 +               if (be->blkif->xenblkd) {
56716 +                       kthread_stop(be->blkif->xenblkd);
56717 +                       be->blkif->xenblkd = NULL;
56718 +               }
56719 +               xenbus_switch_state(dev, XenbusStateClosing);
56720 +               break;
56721 +
56722 +       case XenbusStateClosed:
56723 +               xenbus_switch_state(dev, XenbusStateClosed);
56724 +               if (xenbus_dev_is_online(dev))
56725 +                       break;
56726 +               /* fall through if not online */
56727 +       case XenbusStateUnknown:
56728 +               device_unregister(&dev->dev);
56729 +               break;
56730 +
56731 +       default:
56732 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
56733 +                                frontend_state);
56734 +               break;
56735 +       }
56736 +}
56737 +
56738 +
56739 +/**
56740 + * Switch to Connected state.
56741 + */
56742 +static void connect(struct backend_info *be)
56743 +{
56744 +       int err;
56745 +
56746 +       struct xenbus_device *dev = be->dev;
56747 +
56748 +       err = xenbus_switch_state(dev, XenbusStateConnected);
56749 +       if (err)
56750 +               xenbus_dev_fatal(dev, err, "switching to Connected state",
56751 +                                dev->nodename);
56752 +
56753 +       return;
56754 +}
56755 +
56756 +
56757 +static int connect_ring(struct backend_info *be)
56758 +{
56759 +       struct xenbus_device *dev = be->dev;
56760 +       unsigned long ring_ref;
56761 +       unsigned int evtchn;
56762 +       int err;
56763 +
56764 +       DPRINTK("%s\n", dev->otherend);
56765 +
56766 +       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", 
56767 +                           &ring_ref, "event-channel", "%u", &evtchn, NULL);
56768 +       if (err) {
56769 +               xenbus_dev_fatal(dev, err,
56770 +                                "reading %s/ring-ref and event-channel",
56771 +                                dev->otherend);
56772 +               return err;
56773 +       }
56774 +
56775 +       /* Map the shared frame, irq etc. */
56776 +       err = tap_blkif_map(be->blkif, ring_ref, evtchn);
56777 +       if (err) {
56778 +               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
56779 +                                ring_ref, evtchn);
56780 +               return err;
56781 +       } 
56782 +
56783 +       return 0;
56784 +}
56785 +
56786 +
56787 +/* ** Driver Registration ** */
56788 +
56789 +
56790 +static struct xenbus_device_id blktap_ids[] = {
56791 +       { "tap" },
56792 +       { "" }
56793 +};
56794 +
56795 +
56796 +static struct xenbus_driver blktap = {
56797 +       .name = "tap",
56798 +       .owner = THIS_MODULE,
56799 +       .ids = blktap_ids,
56800 +       .probe = blktap_probe,
56801 +       .remove = blktap_remove,
56802 +       .otherend_changed = tap_frontend_changed
56803 +};
56804 +
56805 +
56806 +void tap_blkif_xenbus_init(void)
56807 +{
56808 +       xenbus_register_backend(&blktap);
56809 +}
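get_id() above walks the xenbus node name, uses strsep_len() to locate the third '/' and parses the number that follows it as the device's xenbus id. The fragment below is a stand-alone illustration of that extraction; the node-name layout used in the example (backend/tap/<domid>/<id>) is an assumption for demonstration, not something this patch defines.

/* Stand-alone illustration of the id extraction performed by get_id():
 * skip to the character after the third '/' and parse the number there.
 * The node name below is a made-up example of an assumed
 * backend/tap/<domid>/<id> layout. */
#include <stdio.h>
#include <stdlib.h>

static long example_get_id(const char *nodename)
{
	int slashes = 0;
	const char *p;

	for (p = nodename; *p; p++) {
		if (*p == '/' && ++slashes == 3)
			return strtol(p + 1, NULL, 10);
	}
	return -1;
}

int main(void)
{
	printf("%ld\n", example_get_id("backend/tap/7/2049"));  /* prints 2049 */
	return 0;
}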
56810 diff -ruNp linux-2.6.19/drivers/xen/char/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/char/Makefile
56811 --- linux-2.6.19/drivers/xen/char/Makefile      1970-01-01 00:00:00.000000000 +0000
56812 +++ linux-2.6.19-xen-3.0.4/drivers/xen/char/Makefile    2007-02-02 19:10:45.000000000 +0000
56813 @@ -0,0 +1,2 @@
56814 +
56815 +obj-y  := mem.o
56816 diff -ruNp linux-2.6.19/drivers/xen/char/mem.c linux-2.6.19-xen-3.0.4/drivers/xen/char/mem.c
56817 --- linux-2.6.19/drivers/xen/char/mem.c 1970-01-01 00:00:00.000000000 +0000
56818 +++ linux-2.6.19-xen-3.0.4/drivers/xen/char/mem.c       2007-02-02 19:10:45.000000000 +0000
56819 @@ -0,0 +1,203 @@
56820 +/*
56821 + *  Originally from linux/drivers/char/mem.c
56822 + *
56823 + *  Copyright (C) 1991, 1992  Linus Torvalds
56824 + *
56825 + *  Added devfs support. 
56826 + *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
56827 + *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
56828 + */
56829 +
56830 +#include <linux/mm.h>
56831 +#include <linux/miscdevice.h>
56832 +#include <linux/slab.h>
56833 +#include <linux/vmalloc.h>
56834 +#include <linux/mman.h>
56835 +#include <linux/random.h>
56836 +#include <linux/init.h>
56837 +#include <linux/raw.h>
56838 +#include <linux/tty.h>
56839 +#include <linux/capability.h>
56840 +#include <linux/smp_lock.h>
56841 +#include <linux/ptrace.h>
56842 +#include <linux/device.h>
56843 +#include <asm/pgalloc.h>
56844 +#include <asm/uaccess.h>
56845 +#include <asm/io.h>
56846 +#include <asm/hypervisor.h>
56847 +
56848 +#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE
56849 +static inline int valid_phys_addr_range(unsigned long addr, size_t *count)
56850 +{
56851 +       return 1;
56852 +}
56853 +#endif
56854 +
56855 +/*
56856 + * This function reads the *physical* memory. The f_pos points directly to the 
56857 + * memory location. 
56858 + */
56859 +static ssize_t read_mem(struct file * file, char __user * buf,
56860 +                       size_t count, loff_t *ppos)
56861 +{
56862 +       unsigned long p = *ppos, ignored;
56863 +       ssize_t read = 0, sz;
56864 +       void __iomem *v;
56865 +
56866 +       if (!valid_phys_addr_range(p, &count))
56867 +               return -EFAULT;
56868 +
56869 +       while (count > 0) {
56870 +               /*
56871 +                * Handle first page in case it's not aligned
56872 +                */
56873 +               if (-p & (PAGE_SIZE - 1))
56874 +                       sz = -p & (PAGE_SIZE - 1);
56875 +               else
56876 +                       sz = PAGE_SIZE;
56877 +
56878 +               sz = min_t(unsigned long, sz, count);
56879 +
56880 +               v = xlate_dev_mem_ptr(p, sz);
56881 +               if (IS_ERR(v) || v == NULL) {
56882 +                       /*
56883 +                        * Some programs (e.g., dmidecode) groove off into
56884 +                        * weird RAM areas where no tables can possibly exist
56885 +                        * (because Xen will have stomped on them!). These
56886 +                        * programs get rather upset if we let them know that
56887 +                        * Xen failed their access, so we fake out a read of
56888 +                        * all zeroes.
56889 +                        */
56890 +                       if (clear_user(buf, count))
56891 +                               return -EFAULT;
56892 +                       read += count;
56893 +                       break;
56894 +               }
56895 +
56896 +               ignored = copy_to_user(buf, v, sz);
56897 +               xlate_dev_mem_ptr_unmap(v);
56898 +               if (ignored)
56899 +                       return -EFAULT;
56900 +               buf += sz;
56901 +               p += sz;
56902 +               count -= sz;
56903 +               read += sz;
56904 +       }
56905 +
56906 +       *ppos += read;
56907 +       return read;
56908 +}
56909 +
56910 +static ssize_t write_mem(struct file * file, const char __user * buf, 
56911 +                        size_t count, loff_t *ppos)
56912 +{
56913 +       unsigned long p = *ppos, ignored;
56914 +       ssize_t written = 0, sz;
56915 +       void __iomem *v;
56916 +
56917 +       if (!valid_phys_addr_range(p, &count))
56918 +               return -EFAULT;
56919 +
56920 +       while (count > 0) {
56921 +               /*
56922 +                * Handle first page in case it's not aligned
56923 +                */
56924 +               if (-p & (PAGE_SIZE - 1))
56925 +                       sz = -p & (PAGE_SIZE - 1);
56926 +               else
56927 +                       sz = PAGE_SIZE;
56928 +
56929 +               sz = min_t(unsigned long, sz, count);
56930 +
56931 +               v = xlate_dev_mem_ptr(p, sz);
56932 +               if (v == NULL)
56933 +                       break;
56934 +               if (IS_ERR(v)) {
56935 +                       if (written == 0)
56936 +                               return PTR_ERR(v);
56937 +                       break;
56938 +               }
56939 +
56940 +               ignored = copy_from_user(v, buf, sz);
56941 +               xlate_dev_mem_ptr_unmap(v);
56942 +               if (ignored) {
56943 +                       written += sz - ignored;
56944 +                       if (written)
56945 +                               break;
56946 +                       return -EFAULT;
56947 +               }
56948 +               buf += sz;
56949 +               p += sz;
56950 +               count -= sz;
56951 +               written += sz;
56952 +       }
56953 +
56954 +       *ppos += written;
56955 +       return written;
56956 +}
56957 +
56958 +#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
56959 +static inline int uncached_access(struct file *file)
56960 +{
56961 +       if (file->f_flags & O_SYNC)
56962 +               return 1;
56963 +       /* Xen sets correct MTRR type on non-RAM for us. */
56964 +       return 0;
56965 +}
56966 +
56967 +static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
56968 +{
56969 +       size_t size = vma->vm_end - vma->vm_start;
56970 +
56971 +       if (uncached_access(file))
56972 +               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
56973 +
56974 +       /* We want to return the real error code, not EAGAIN. */
56975 +       return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
56976 +                                     size, vma->vm_page_prot, DOMID_IO);
56977 +}
56978 +#endif
56979 +
56980 +/*
56981 + * The memory devices use the full 32/64 bits of the offset, and so we cannot
56982 + * check against negative addresses: they are ok. The return value is weird,
56983 + * though, in that case (0).
56984 + *
56985 + * also note that seeking relative to the "end of file" isn't supported:
56986 + * Also note that seeking relative to the "end of file" isn't supported:
56987 + */
56988 +static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
56989 +{
56990 +       loff_t ret;
56991 +
56992 +       mutex_lock(&file->f_dentry->d_inode->i_mutex);
56993 +       switch (orig) {
56994 +               case 0:
56995 +                       file->f_pos = offset;
56996 +                       ret = file->f_pos;
56997 +                       force_successful_syscall_return();
56998 +                       break;
56999 +               case 1:
57000 +                       file->f_pos += offset;
57001 +                       ret = file->f_pos;
57002 +                       force_successful_syscall_return();
57003 +                       break;
57004 +               default:
57005 +                       ret = -EINVAL;
57006 +       }
57007 +       mutex_unlock(&file->f_dentry->d_inode->i_mutex);
57008 +       return ret;
57009 +}
57010 +
57011 +static int open_mem(struct inode * inode, struct file * filp)
57012 +{
57013 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
57014 +}
57015 +
57016 +struct file_operations mem_fops = {
57017 +       .llseek         = memory_lseek,
57018 +       .read           = read_mem,
57019 +       .write          = write_mem,
57020 +       .mmap           = xen_mmap_mem,
57021 +       .open           = open_mem,
57022 +};
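read_mem() and write_mem() above copy at most one page per loop iteration and shrink the very first chunk so that every later chunk starts on a page boundary; the expression -p & (PAGE_SIZE - 1) is the distance from p up to the next page boundary. Below is a minimal stand-alone demonstration of that size computation, assuming a 4096-byte page purely for the example.

/* Demonstrates the first-chunk size computation used by read_mem()/
 * write_mem(): (-p & (PAGE_SIZE - 1)) is the distance from p to the
 * next page boundary (0 when p is already aligned, in which case a
 * whole page is used). A 4096-byte page is assumed here. */
#include <stdio.h>

#define EX_PAGE_SIZE 4096UL

static unsigned long first_chunk(unsigned long p, unsigned long count)
{
	unsigned long sz;

	if (-p & (EX_PAGE_SIZE - 1))
		sz = -p & (EX_PAGE_SIZE - 1);
	else
		sz = EX_PAGE_SIZE;

	return sz < count ? sz : count;
}

int main(void)
{
	printf("%lu\n", first_chunk(0x1000, 10000));  /* aligned: 4096 */
	printf("%lu\n", first_chunk(0x1ff0, 10000));  /* unaligned: 16 */
	printf("%lu\n", first_chunk(0x1ff0, 8));      /* capped: 8     */
	return 0;
}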
57023 diff -ruNp linux-2.6.19/drivers/xen/console/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/console/Makefile
57024 --- linux-2.6.19/drivers/xen/console/Makefile   1970-01-01 00:00:00.000000000 +0000
57025 +++ linux-2.6.19-xen-3.0.4/drivers/xen/console/Makefile 2007-02-02 19:10:45.000000000 +0000
57026 @@ -0,0 +1,2 @@
57027 +
57028 +obj-y  := console.o xencons_ring.o
57029 diff -ruNp linux-2.6.19/drivers/xen/console/console.c linux-2.6.19-xen-3.0.4/drivers/xen/console/console.c
57030 --- linux-2.6.19/drivers/xen/console/console.c  1970-01-01 00:00:00.000000000 +0000
57031 +++ linux-2.6.19-xen-3.0.4/drivers/xen/console/console.c        2007-02-02 19:10:45.000000000 +0000
57032 @@ -0,0 +1,717 @@
57033 +/******************************************************************************
57034 + * console.c
57035 + * 
57036 + * Virtual console driver.
57037 + * 
57038 + * Copyright (c) 2002-2004, K A Fraser.
57039 + * 
57040 + * This program is free software; you can redistribute it and/or
57041 + * modify it under the terms of the GNU General Public License version 2
57042 + * as published by the Free Software Foundation; or, when distributed
57043 + * separately from the Linux kernel or incorporated into other
57044 + * software packages, subject to the following license:
57045 + * 
57046 + * Permission is hereby granted, free of charge, to any person obtaining a copy
57047 + * of this source file (the "Software"), to deal in the Software without
57048 + * restriction, including without limitation the rights to use, copy, modify,
57049 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57050 + * and to permit persons to whom the Software is furnished to do so, subject to
57051 + * the following conditions:
57052 + * 
57053 + * The above copyright notice and this permission notice shall be included in
57054 + * all copies or substantial portions of the Software.
57055 + * 
57056 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57057 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57058 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57059 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57060 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57061 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57062 + * IN THE SOFTWARE.
57063 + */
57064 +
57065 +#include <linux/version.h>
57066 +#include <linux/module.h>
57067 +#include <linux/errno.h>
57068 +#include <linux/signal.h>
57069 +#include <linux/sched.h>
57070 +#include <linux/interrupt.h>
57071 +#include <linux/tty.h>
57072 +#include <linux/tty_flip.h>
57073 +#include <linux/vt.h>
57074 +#include <linux/serial.h>
57075 +#include <linux/major.h>
57076 +#include <linux/ptrace.h>
57077 +#include <linux/ioport.h>
57078 +#include <linux/mm.h>
57079 +#include <linux/slab.h>
57080 +#include <linux/init.h>
57081 +#include <linux/console.h>
57082 +#include <linux/bootmem.h>
57083 +#include <linux/sysrq.h>
57084 +#include <linux/screen_info.h>
57085 +#include <asm/io.h>
57086 +#include <asm/irq.h>
57087 +#include <asm/uaccess.h>
57088 +#include <xen/interface/xen.h>
57089 +#include <xen/interface/event_channel.h>
57090 +#include <asm/hypervisor.h>
57091 +#include <xen/evtchn.h>
57092 +#include <xen/xenbus.h>
57093 +#include <xen/xencons.h>
57094 +
57095 +/*
57096 + * Modes:
57097 + *  'xencons=off'  [XC_OFF]:     Console is disabled.
57098 + *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
57099 + *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
57100 + *  'xencons=xvc'  [XC_XVC]:     Console attached to '/dev/xvc0'.
57101 + *  default:                     DOM0 -> XC_SERIAL ; all others -> XC_TTY.
57102 + * 
57103 + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
57104 + * warnings from standard distro startup scripts.
57105 + */
57106 +static enum {
57107 +       XC_OFF, XC_TTY, XC_SERIAL, XC_XVC
57108 +} xc_mode;
57109 +static int xc_num = -1;
57110 +
57111 +/* /dev/xvc0 device number allocated by lanana.org. */
57112 +#define XEN_XVC_MAJOR 204
57113 +#define XEN_XVC_MINOR 191
57114 +
57115 +#ifdef CONFIG_MAGIC_SYSRQ
57116 +static unsigned long sysrq_requested;
57117 +extern int sysrq_enabled;
57118 +#endif
57119 +
57120 +void xencons_early_setup(void)
57121 +{
57122 +       extern int console_use_vt;
57123 +
57124 +       if (is_initial_xendomain()) {
57125 +               xc_mode = XC_SERIAL;
57126 +       } else {
57127 +               xc_mode = XC_TTY;
57128 +               console_use_vt = 0;
57129 +       }
57130 +}
57131 +
57132 +static int __init xencons_setup(char *str)
57133 +{
57134 +       char *q;
57135 +       int n;
57136 +       extern int console_use_vt;
57137 +
57138 +       console_use_vt = 1;
57139 +       if (!strncmp(str, "ttyS", 4)) {
57140 +               xc_mode = XC_SERIAL;
57141 +               str += 4;
57142 +       } else if (!strncmp(str, "tty", 3)) {
57143 +               xc_mode = XC_TTY;
57144 +               str += 3;
57145 +               console_use_vt = 0;
57146 +       } else if (!strncmp(str, "xvc", 3)) {
57147 +               xc_mode = XC_XVC;
57148 +               str += 3;
57149 +       } else if (!strncmp(str, "off", 3)) {
57150 +               xc_mode = XC_OFF;
57151 +               str += 3;
57152 +       }
57153 +
57154 +       n = simple_strtol(str, &q, 10);
57155 +       if (q != str)
57156 +               xc_num = n;
57157 +
57158 +       return 1;
57159 +}
57160 +__setup("xencons=", xencons_setup);
57161 +
57162 +/* The kernel and user-land drivers share a common transmit buffer. */
57163 +static unsigned int wbuf_size = 4096;
57164 +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
57165 +static char *wbuf;
57166 +static unsigned int wc, wp; /* write_cons, write_prod */
57167 +
57168 +static int __init xencons_bufsz_setup(char *str)
57169 +{
57170 +       unsigned int goal;
57171 +       goal = simple_strtoul(str, NULL, 0);
57172 +       if (goal) {
57173 +               goal = roundup_pow_of_two(goal);
57174 +               if (wbuf_size < goal)
57175 +                       wbuf_size = goal;
57176 +       }
57177 +       return 1;
57178 +}
57179 +__setup("xencons_bufsz=", xencons_bufsz_setup);
57180 +
57181 +/* This lock protects accesses to the common transmit buffer. */
57182 +static DEFINE_SPINLOCK(xencons_lock);
57183 +
57184 +/* Common transmit-kick routine. */
57185 +static void __xencons_tx_flush(void);
57186 +
57187 +static struct tty_driver *xencons_driver;
57188 +
57189 +/******************** Kernel console driver ********************************/
57190 +
57191 +static void kcons_write(struct console *c, const char *s, unsigned int count)
57192 +{
57193 +       int           i = 0;
57194 +       unsigned long flags;
57195 +
57196 +       spin_lock_irqsave(&xencons_lock, flags);
57197 +
57198 +       while (i < count) {
57199 +               for (; i < count; i++) {
57200 +                       if ((wp - wc) >= (wbuf_size - 1))
57201 +                               break;
57202 +                       if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
57203 +                               wbuf[WBUF_MASK(wp++)] = '\r';
57204 +               }
57205 +
57206 +               __xencons_tx_flush();
57207 +       }
57208 +
57209 +       spin_unlock_irqrestore(&xencons_lock, flags);
57210 +}
57211 +
57212 +static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
57213 +{
57214 +
57215 +       while (count > 0) {
57216 +               int rc;
57217 +               rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s);
57218 +               if (rc <= 0)
57219 +                       break;
57220 +               count -= rc;
57221 +               s += rc;
57222 +       }
57223 +}
57224 +
57225 +static struct tty_driver *kcons_device(struct console *c, int *index)
57226 +{
57227 +       *index = 0;
57228 +       return xencons_driver;
57229 +}
57230 +
57231 +static struct console kcons_info = {
57232 +       .device = kcons_device,
57233 +       .flags  = CON_PRINTBUFFER | CON_ENABLED,
57234 +       .index  = -1,
57235 +};
57236 +
57237 +static int __init xen_console_init(void)
57238 +{
57239 +       if (!is_running_on_xen())
57240 +               goto out;
57241 +
57242 +       if (is_initial_xendomain()) {
57243 +               kcons_info.write = kcons_write_dom0;
57244 +       } else {
57245 +               if (!xen_start_info->console.domU.evtchn)
57246 +                       goto out;
57247 +               kcons_info.write = kcons_write;
57248 +       }
57249 +
57250 +       switch (xc_mode) {
57251 +       case XC_XVC:
57252 +               strcpy(kcons_info.name, "xvc");
57253 +               if (xc_num == -1)
57254 +                       xc_num = 0;
57255 +               break;
57256 +
57257 +       case XC_SERIAL:
57258 +               strcpy(kcons_info.name, "ttyS");
57259 +               if (xc_num == -1)
57260 +                       xc_num = 0;
57261 +               break;
57262 +
57263 +       case XC_TTY:
57264 +               strcpy(kcons_info.name, "tty");
57265 +               if (xc_num == -1)
57266 +                       xc_num = 1;
57267 +               break;
57268 +
57269 +       default:
57270 +               goto out;
57271 +       }
57272 +
57273 +       wbuf = alloc_bootmem(wbuf_size);
57274 +
57275 +       register_console(&kcons_info);
57276 +
57277 + out:
57278 +       return 0;
57279 +}
57280 +console_initcall(xen_console_init);
57281 +
57282 +/*** Useful function for console debugging -- goes straight to Xen. ***/
57283 +asmlinkage int xprintk(const char *fmt, ...)
57284 +{
57285 +       va_list args;
57286 +       int printk_len;
57287 +       static char printk_buf[1024];
57288 +
57289 +       /* Emit the output into the temporary buffer */
57290 +       va_start(args, fmt);
57291 +       printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
57292 +       va_end(args);
57293 +
57294 +       /* Send the processed output directly to Xen. */
57295 +       kcons_write_dom0(NULL, printk_buf, printk_len);
57296 +
57297 +       return 0;
57298 +}
57299 +
57300 +/*** Forcibly flush console data before dying. ***/
57301 +void xencons_force_flush(void)
57302 +{
57303 +       int sz;
57304 +
57305 +       /* Emergency console is synchronous, so there's nothing to flush. */
57306 +       if (!is_running_on_xen() ||
57307 +           is_initial_xendomain() ||
57308 +           !xen_start_info->console.domU.evtchn)
57309 +               return;
57310 +
57311 +       /* Spin until console data is flushed through to the daemon. */
57312 +       while (wc != wp) {
57313 +               int sent = 0;
57314 +               if ((sz = wp - wc) == 0)
57315 +                       continue;
57316 +               sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
57317 +               if (sent > 0)
57318 +                       wc += sent;
57319 +       }
57320 +}
57321 +
57322 +
57323 +void dom0_init_screen_info(const struct dom0_vga_console_info *info)
57324 +{
57325 +       switch (info->video_type) {
57326 +       case XEN_VGATYPE_TEXT_MODE_3:
57327 +               screen_info.orig_video_mode = 3;
57328 +               screen_info.orig_video_ega_bx = 3;
57329 +               screen_info.orig_video_isVGA = 1;
57330 +               screen_info.orig_video_lines = info->u.text_mode_3.rows;
57331 +               screen_info.orig_video_cols = info->u.text_mode_3.columns;
57332 +               screen_info.orig_x = info->u.text_mode_3.cursor_x;
57333 +               screen_info.orig_y = info->u.text_mode_3.cursor_y;
57334 +               screen_info.orig_video_points =
57335 +                       info->u.text_mode_3.font_height;
57336 +               break;
57337 +       case XEN_VGATYPE_VESA_LFB:
57338 +               screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
57339 +               screen_info.lfb_width = info->u.vesa_lfb.width;
57340 +               screen_info.lfb_height = info->u.vesa_lfb.height;
57341 +               screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel;
57342 +               screen_info.lfb_base = info->u.vesa_lfb.lfb_base;
57343 +               screen_info.lfb_size = info->u.vesa_lfb.lfb_size;
57344 +               screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line;
57345 +               screen_info.red_size = info->u.vesa_lfb.red_size;
57346 +               screen_info.red_pos = info->u.vesa_lfb.red_pos;
57347 +               screen_info.green_size = info->u.vesa_lfb.green_size;
57348 +               screen_info.green_pos = info->u.vesa_lfb.green_pos;
57349 +               screen_info.blue_size = info->u.vesa_lfb.blue_size;
57350 +               screen_info.blue_pos = info->u.vesa_lfb.blue_pos;
57351 +               screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size;
57352 +               screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos;
57353 +               break;
57354 +       }
57355 +}
57356 +
57357 +
57358 +/******************** User-space console driver (/dev/console) ************/
57359 +
57360 +#define DRV(_d)         (_d)
57361 +#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) &&                \
57362 +                        ((_tty)->index != (xc_num - 1)))
57363 +
57364 +static struct termios *xencons_termios[MAX_NR_CONSOLES];
57365 +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
57366 +static struct tty_struct *xencons_tty;
57367 +static int xencons_priv_irq;
57368 +static char x_char;
57369 +
57370 +void xencons_rx(char *buf, unsigned len)
57371 +{
57372 +       int           i;
57373 +       unsigned long flags;
57374 +
57375 +       spin_lock_irqsave(&xencons_lock, flags);
57376 +       if (xencons_tty == NULL)
57377 +               goto out;
57378 +
57379 +       for (i = 0; i < len; i++) {
57380 +#ifdef CONFIG_MAGIC_SYSRQ
57381 +               if (sysrq_enabled) {
57382 +                       if (buf[i] == '\x0f') { /* ^O */
57383 +                               sysrq_requested = jiffies;
57384 +                               continue; /* don't print the sysrq key */
57385 +                       } else if (sysrq_requested) {
57386 +                               unsigned long sysrq_timeout =
57387 +                                       sysrq_requested + HZ*2;
57388 +                               sysrq_requested = 0;
57389 +                               if (time_before(jiffies, sysrq_timeout)) {
57390 +                                       spin_unlock_irqrestore(
57391 +                                               &xencons_lock, flags);
57392 +                                       handle_sysrq(
57393 +                                               buf[i], xencons_tty);
57394 +                                       spin_lock_irqsave(
57395 +                                               &xencons_lock, flags);
57396 +                                       continue;
57397 +                               }
57398 +                       }
57399 +               }
57400 +#endif
57401 +               tty_insert_flip_char(xencons_tty, buf[i], 0);
57402 +       }
57403 +       tty_flip_buffer_push(xencons_tty);
57404 +
57405 + out:
57406 +       spin_unlock_irqrestore(&xencons_lock, flags);
57407 +}
57408 +
57409 +static void __xencons_tx_flush(void)
57410 +{
57411 +       int sent, sz, work_done = 0;
57412 +
57413 +       if (x_char) {
57414 +               if (is_initial_xendomain())
57415 +                       kcons_write_dom0(NULL, &x_char, 1);
57416 +               else
57417 +                       while (x_char)
57418 +                               if (xencons_ring_send(&x_char, 1) == 1)
57419 +                                       break;
57420 +               x_char = 0;
57421 +               work_done = 1;
57422 +       }
57423 +
57424 +       while (wc != wp) {
57425 +               sz = wp - wc;
57426 +               if (sz > (wbuf_size - WBUF_MASK(wc)))
57427 +                       sz = wbuf_size - WBUF_MASK(wc);
57428 +               if (is_initial_xendomain()) {
57429 +                       kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
57430 +                       wc += sz;
57431 +               } else {
57432 +                       sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
57433 +                       if (sent == 0)
57434 +                               break;
57435 +                       wc += sent;
57436 +               }
57437 +               work_done = 1;
57438 +       }
57439 +
57440 +       if (work_done && (xencons_tty != NULL)) {
57441 +               wake_up_interruptible(&xencons_tty->write_wait);
57442 +               if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
57443 +                   (xencons_tty->ldisc.write_wakeup != NULL))
57444 +                       (xencons_tty->ldisc.write_wakeup)(xencons_tty);
57445 +       }
57446 +}
57447 +
57448 +void xencons_tx(void)
57449 +{
57450 +       unsigned long flags;
57451 +
57452 +       spin_lock_irqsave(&xencons_lock, flags);
57453 +       __xencons_tx_flush();
57454 +       spin_unlock_irqrestore(&xencons_lock, flags);
57455 +}
57456 +
57457 +/* Privileged receive callback and transmit kicker. */
57458 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
57459 +{
57460 +       static char rbuf[16];
57461 +       int         l;
57462 +
57463 +       while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
57464 +               xencons_rx(rbuf, l);
57465 +
57466 +       xencons_tx();
57467 +
57468 +       return IRQ_HANDLED;
57469 +}
57470 +
57471 +static int xencons_write_room(struct tty_struct *tty)
57472 +{
57473 +       return wbuf_size - (wp - wc);
57474 +}
57475 +
57476 +static int xencons_chars_in_buffer(struct tty_struct *tty)
57477 +{
57478 +       return wp - wc;
57479 +}
57480 +
57481 +static void xencons_send_xchar(struct tty_struct *tty, char ch)
57482 +{
57483 +       unsigned long flags;
57484 +
57485 +       if (DUMMY_TTY(tty))
57486 +               return;
57487 +
57488 +       spin_lock_irqsave(&xencons_lock, flags);
57489 +       x_char = ch;
57490 +       __xencons_tx_flush();
57491 +       spin_unlock_irqrestore(&xencons_lock, flags);
57492 +}
57493 +
57494 +static void xencons_throttle(struct tty_struct *tty)
57495 +{
57496 +       if (DUMMY_TTY(tty))
57497 +               return;
57498 +
57499 +       if (I_IXOFF(tty))
57500 +               xencons_send_xchar(tty, STOP_CHAR(tty));
57501 +}
57502 +
57503 +static void xencons_unthrottle(struct tty_struct *tty)
57504 +{
57505 +       if (DUMMY_TTY(tty))
57506 +               return;
57507 +
57508 +       if (I_IXOFF(tty)) {
57509 +               if (x_char != 0)
57510 +                       x_char = 0;
57511 +               else
57512 +                       xencons_send_xchar(tty, START_CHAR(tty));
57513 +       }
57514 +}
57515 +
57516 +static void xencons_flush_buffer(struct tty_struct *tty)
57517 +{
57518 +       unsigned long flags;
57519 +
57520 +       if (DUMMY_TTY(tty))
57521 +               return;
57522 +
57523 +       spin_lock_irqsave(&xencons_lock, flags);
57524 +       wc = wp = 0;
57525 +       spin_unlock_irqrestore(&xencons_lock, flags);
57526 +}
57527 +
57528 +static inline int __xencons_put_char(int ch)
57529 +{
57530 +       char _ch = (char)ch;
57531 +       if ((wp - wc) == wbuf_size)
57532 +               return 0;
57533 +       wbuf[WBUF_MASK(wp++)] = _ch;
57534 +       return 1;
57535 +}
57536 +
57537 +static int xencons_write(
57538 +       struct tty_struct *tty,
57539 +       const unsigned char *buf,
57540 +       int count)
57541 +{
57542 +       int i;
57543 +       unsigned long flags;
57544 +
57545 +       if (DUMMY_TTY(tty))
57546 +               return count;
57547 +
57548 +       spin_lock_irqsave(&xencons_lock, flags);
57549 +
57550 +       for (i = 0; i < count; i++)
57551 +               if (!__xencons_put_char(buf[i]))
57552 +                       break;
57553 +
57554 +       if (i != 0)
57555 +               __xencons_tx_flush();
57556 +
57557 +       spin_unlock_irqrestore(&xencons_lock, flags);
57558 +
57559 +       return i;
57560 +}
57561 +
57562 +static void xencons_put_char(struct tty_struct *tty, u_char ch)
57563 +{
57564 +       unsigned long flags;
57565 +
57566 +       if (DUMMY_TTY(tty))
57567 +               return;
57568 +
57569 +       spin_lock_irqsave(&xencons_lock, flags);
57570 +       (void)__xencons_put_char(ch);
57571 +       spin_unlock_irqrestore(&xencons_lock, flags);
57572 +}
57573 +
57574 +static void xencons_flush_chars(struct tty_struct *tty)
57575 +{
57576 +       unsigned long flags;
57577 +
57578 +       if (DUMMY_TTY(tty))
57579 +               return;
57580 +
57581 +       spin_lock_irqsave(&xencons_lock, flags);
57582 +       __xencons_tx_flush();
57583 +       spin_unlock_irqrestore(&xencons_lock, flags);
57584 +}
57585 +
57586 +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
57587 +{
57588 +       unsigned long orig_jiffies = jiffies;
57589 +
57590 +       if (DUMMY_TTY(tty))
57591 +               return;
57592 +
57593 +       while (DRV(tty->driver)->chars_in_buffer(tty)) {
57594 +               set_current_state(TASK_INTERRUPTIBLE);
57595 +               schedule_timeout(1);
57596 +               if (signal_pending(current))
57597 +                       break;
57598 +               if (timeout && time_after(jiffies, orig_jiffies + timeout))
57599 +                       break;
57600 +       }
57601 +
57602 +       set_current_state(TASK_RUNNING);
57603 +}
57604 +
57605 +static int xencons_open(struct tty_struct *tty, struct file *filp)
57606 +{
57607 +       unsigned long flags;
57608 +
57609 +       if (DUMMY_TTY(tty))
57610 +               return 0;
57611 +
57612 +       spin_lock_irqsave(&xencons_lock, flags);
57613 +       tty->driver_data = NULL;
57614 +       if (xencons_tty == NULL)
57615 +               xencons_tty = tty;
57616 +       __xencons_tx_flush();
57617 +       spin_unlock_irqrestore(&xencons_lock, flags);
57618 +
57619 +       return 0;
57620 +}
57621 +
57622 +static void xencons_close(struct tty_struct *tty, struct file *filp)
57623 +{
57624 +       unsigned long flags;
57625 +
57626 +       if (DUMMY_TTY(tty))
57627 +               return;
57628 +
57629 +       mutex_lock(&tty_mutex);
57630 +
57631 +       if (tty->count != 1) {
57632 +               mutex_unlock(&tty_mutex);
57633 +               return;
57634 +       }
57635 +
57636 +       /* Prevent other threads from re-opening this tty. */
57637 +       set_bit(TTY_CLOSING, &tty->flags);
57638 +       mutex_unlock(&tty_mutex);
57639 +
57640 +       tty->closing = 1;
57641 +       tty_wait_until_sent(tty, 0);
57642 +       if (DRV(tty->driver)->flush_buffer != NULL)
57643 +               DRV(tty->driver)->flush_buffer(tty);
57644 +       if (tty->ldisc.flush_buffer != NULL)
57645 +               tty->ldisc.flush_buffer(tty);
57646 +       tty->closing = 0;
57647 +       spin_lock_irqsave(&xencons_lock, flags);
57648 +       xencons_tty = NULL;
57649 +       spin_unlock_irqrestore(&xencons_lock, flags);
57650 +}
57651 +
57652 +static struct tty_operations xencons_ops = {
57653 +       .open = xencons_open,
57654 +       .close = xencons_close,
57655 +       .write = xencons_write,
57656 +       .write_room = xencons_write_room,
57657 +       .put_char = xencons_put_char,
57658 +       .flush_chars = xencons_flush_chars,
57659 +       .chars_in_buffer = xencons_chars_in_buffer,
57660 +       .send_xchar = xencons_send_xchar,
57661 +       .flush_buffer = xencons_flush_buffer,
57662 +       .throttle = xencons_throttle,
57663 +       .unthrottle = xencons_unthrottle,
57664 +       .wait_until_sent = xencons_wait_until_sent,
57665 +};
57666 +
57667 +static int __init xencons_init(void)
57668 +{
57669 +       int rc;
57670 +
57671 +       if (!is_running_on_xen())
57672 +               return -ENODEV;
57673 +
57674 +       if (xc_mode == XC_OFF)
57675 +               return 0;
57676 +
57677 +       if (!is_initial_xendomain()) {
57678 +               rc = xencons_ring_init();
57679 +               if (rc)
57680 +                       return rc;
57681 +       }
57682 +
57683 +       xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ?
57684 +                                         MAX_NR_CONSOLES : 1);
57685 +       if (xencons_driver == NULL)
57686 +               return -ENOMEM;
57687 +
57688 +       DRV(xencons_driver)->name            = "xencons";
57689 +       DRV(xencons_driver)->major           = TTY_MAJOR;
57690 +       DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
57691 +       DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
57692 +       DRV(xencons_driver)->init_termios    = tty_std_termios;
57693 +       DRV(xencons_driver)->flags           =
57694 +               TTY_DRIVER_REAL_RAW |
57695 +               TTY_DRIVER_RESET_TERMIOS;
57696 +       DRV(xencons_driver)->termios         = xencons_termios;
57697 +       DRV(xencons_driver)->termios_locked  = xencons_termios_locked;
57698 +
57699 +       switch (xc_mode) {
57700 +       case XC_XVC:
57701 +               DRV(xencons_driver)->name        = "xvc";
57702 +               DRV(xencons_driver)->major       = XEN_XVC_MAJOR;
57703 +               DRV(xencons_driver)->minor_start = XEN_XVC_MINOR;
57704 +               DRV(xencons_driver)->name_base   = xc_num;
57705 +               break;
57706 +       case XC_SERIAL:
57707 +               DRV(xencons_driver)->name        = "ttyS";
57708 +               DRV(xencons_driver)->minor_start = 64 + xc_num;
57709 +               DRV(xencons_driver)->name_base   = xc_num;
57710 +               break;
57711 +       default:
57712 +               DRV(xencons_driver)->name        = "tty";
57713 +               DRV(xencons_driver)->minor_start = 1;
57714 +               DRV(xencons_driver)->name_base   = 1;
57715 +               break;
57716 +       }
57717 +
57718 +       tty_set_operations(xencons_driver, &xencons_ops);
57719 +
57720 +       if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
57721 +               printk("WARNING: Failed to register Xen virtual "
57722 +                      "console driver as '%s%d'\n",
57723 +                      DRV(xencons_driver)->name,
57724 +                      DRV(xencons_driver)->name_base);
57725 +               put_tty_driver(xencons_driver);
57726 +               xencons_driver = NULL;
57727 +               return rc;
57728 +       }
57729 +
57730 +       if (is_initial_xendomain()) {
57731 +               xencons_priv_irq = bind_virq_to_irqhandler(
57732 +                       VIRQ_CONSOLE,
57733 +                       0,
57734 +                       xencons_priv_interrupt,
57735 +                       0,
57736 +                       "console",
57737 +                       NULL);
57738 +               BUG_ON(xencons_priv_irq < 0);
57739 +       }
57740 +
57741 +       printk("Xen virtual console successfully installed as %s%d\n",
57742 +              DRV(xencons_driver)->name, xc_num);
57743 +
57744 +       return 0;
57745 +}
57746 +
57747 +module_init(xencons_init);
57748 +
57749 +MODULE_LICENSE("Dual BSD/GPL");
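
The write path in the console driver above relies on free-running producer/consumer indices into a power-of-two buffer: wp and wc only ever increase, occupancy is wp - wc, the buffer is full when that difference equals wbuf_size, and WBUF_MASK() recovers the array slot. A minimal standalone sketch of that same idiom follows; the names and the tiny size are illustrative only and are not taken from the patch.

#include <stdio.h>

#define RING_SIZE 16                      /* must be a power of two */
#define RING_MASK(idx) ((idx) & (RING_SIZE - 1))

static char ring[RING_SIZE];
static unsigned int prod, cons;           /* free-running counters */

static int ring_put(char ch)
{
	if (prod - cons == RING_SIZE)     /* full: occupancy equals size */
		return 0;
	ring[RING_MASK(prod++)] = ch;
	return 1;
}

static int ring_get(char *ch)
{
	if (prod == cons)                 /* empty */
		return 0;
	*ch = ring[RING_MASK(cons++)];
	return 1;
}

int main(void)
{
	const char *msg = "hello";
	char ch;

	while (*msg)
		ring_put(*msg++);
	while (ring_get(&ch))
		putchar(ch);
	putchar('\n');
	return 0;                         /* prints: hello */
}
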
57750 diff -ruNp linux-2.6.19/drivers/xen/console/xencons_ring.c linux-2.6.19-xen-3.0.4/drivers/xen/console/xencons_ring.c
57751 --- linux-2.6.19/drivers/xen/console/xencons_ring.c     1970-01-01 00:00:00.000000000 +0000
57752 +++ linux-2.6.19-xen-3.0.4/drivers/xen/console/xencons_ring.c   2007-02-02 19:10:45.000000000 +0000
57753 @@ -0,0 +1,143 @@
57754 +/* 
57755 + * This program is free software; you can redistribute it and/or
57756 + * modify it under the terms of the GNU General Public License version 2
57757 + * as published by the Free Software Foundation; or, when distributed
57758 + * separately from the Linux kernel or incorporated into other
57759 + * software packages, subject to the following license:
57760 + * 
57761 + * Permission is hereby granted, free of charge, to any person obtaining a copy
57762 + * of this source file (the "Software"), to deal in the Software without
57763 + * restriction, including without limitation the rights to use, copy, modify,
57764 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57765 + * and to permit persons to whom the Software is furnished to do so, subject to
57766 + * the following conditions:
57767 + * 
57768 + * The above copyright notice and this permission notice shall be included in
57769 + * all copies or substantial portions of the Software.
57770 + * 
57771 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57772 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57773 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57774 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57775 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57776 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57777 + * IN THE SOFTWARE.
57778 + */
57779 +
57780 +#include <linux/version.h>
57781 +#include <linux/module.h>
57782 +#include <linux/errno.h>
57783 +#include <linux/signal.h>
57784 +#include <linux/sched.h>
57785 +#include <linux/interrupt.h>
57786 +#include <linux/tty.h>
57787 +#include <linux/tty_flip.h>
57788 +#include <linux/serial.h>
57789 +#include <linux/major.h>
57790 +#include <linux/ptrace.h>
57791 +#include <linux/ioport.h>
57792 +#include <linux/mm.h>
57793 +#include <linux/slab.h>
57794 +
57795 +#include <asm/hypervisor.h>
57796 +#include <xen/evtchn.h>
57797 +#include <xen/xencons.h>
57798 +#include <linux/wait.h>
57799 +#include <linux/interrupt.h>
57800 +#include <linux/sched.h>
57801 +#include <linux/err.h>
57802 +#include <xen/interface/io/console.h>
57803 +
57804 +static int xencons_irq;
57805 +
57806 +static inline struct xencons_interface *xencons_interface(void)
57807 +{
57808 +       return mfn_to_virt(xen_start_info->console.domU.mfn);
57809 +}
57810 +
57811 +static inline void notify_daemon(void)
57812 +{
57813 +       /* Use evtchn: this is called early, before irq is set up. */
57814 +       notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
57815 +}
57816 +
57817 +int xencons_ring_send(const char *data, unsigned len)
57818 +{
57819 +       int sent = 0;
57820 +       struct xencons_interface *intf = xencons_interface();
57821 +       XENCONS_RING_IDX cons, prod;
57822 +
57823 +       cons = intf->out_cons;
57824 +       prod = intf->out_prod;
57825 +       mb();
57826 +       BUG_ON((prod - cons) > sizeof(intf->out));
57827 +
57828 +       while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
57829 +               intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
57830 +
57831 +       wmb();
57832 +       intf->out_prod = prod;
57833 +
57834 +       notify_daemon();
57835 +
57836 +       return sent;
57837 +}
57838 +
57839 +static irqreturn_t handle_input(int irq, void *unused)
57840 +{
57841 +       struct xencons_interface *intf = xencons_interface();
57842 +       XENCONS_RING_IDX cons, prod;
57843 +
57844 +       cons = intf->in_cons;
57845 +       prod = intf->in_prod;
57846 +       mb();
57847 +       BUG_ON((prod - cons) > sizeof(intf->in));
57848 +
57849 +       while (cons != prod) {
57850 +               xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
57851 +               cons++;
57852 +       }
57853 +
57854 +       mb();
57855 +       intf->in_cons = cons;
57856 +
57857 +       notify_daemon();
57858 +
57859 +       xencons_tx();
57860 +
57861 +       return IRQ_HANDLED;
57862 +}
57863 +
57864 +int xencons_ring_init(void)
57865 +{
57866 +       int irq;
57867 +
57868 +       if (xencons_irq)
57869 +               unbind_from_irqhandler(xencons_irq, NULL);
57870 +       xencons_irq = 0;
57871 +
57872 +       if (!is_running_on_xen() ||
57873 +           is_initial_xendomain() ||
57874 +           !xen_start_info->console.domU.evtchn)
57875 +               return -ENODEV;
57876 +
57877 +       irq = bind_evtchn_to_irqhandler(
57878 +               xen_start_info->console.domU.evtchn,
57879 +               handle_input, 0, "xencons", NULL);
57880 +       if (irq < 0) {
57881 +               printk(KERN_ERR "XEN console request irq failed %i\n", irq);
57882 +               return irq;
57883 +       }
57884 +
57885 +       xencons_irq = irq;
57886 +
57887 +       /* In case we have in-flight data after save/restore... */
57888 +       notify_daemon();
57889 +
57890 +       return 0;
57891 +}
57892 +
57893 +void xencons_resume(void)
57894 +{
57895 +       (void)xencons_ring_init();
57896 +}
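
xencons_ring_send() above is the guest half of a single shared page (struct xencons_interface) holding the out[] buffer plus free-running out_cons/out_prod indices: bytes are queued through MASK_XENCONS_IDX(), wmb() makes them visible before the new producer index is published, and notify_daemon() kicks the peer over the event channel. Below is a single-process sketch of the same send logic on a local copy of that layout, with the cross-domain barriers reduced to comments; names and the ring size are illustrative, not from the patch.

#include <stdio.h>
#include <string.h>

#define OUT_SIZE 8                        /* real rings are larger */
#define OUT_MASK(idx) ((idx) & (OUT_SIZE - 1))

struct demo_interface {
	char out[OUT_SIZE];
	unsigned int out_cons, out_prod;  /* free-running indices */
};

static unsigned int demo_send(struct demo_interface *intf,
			      const char *data, unsigned int len)
{
	unsigned int sent = 0;
	unsigned int cons = intf->out_cons;
	unsigned int prod = intf->out_prod;
	/* mb() in the shared-page case: read both indices before data */

	while (sent < len && (prod - cons) < OUT_SIZE)
		intf->out[OUT_MASK(prod++)] = data[sent++];

	/* wmb() in the shared-page case: bytes before the new index */
	intf->out_prod = prod;
	/* ...then notify the peer via its event channel */
	return sent;
}

int main(void)
{
	struct demo_interface intf;
	unsigned int n;

	memset(&intf, 0, sizeof(intf));
	n = demo_send(&intf, "0123456789", 10);
	printf("queued %u of 10 bytes (ring holds %d)\n", n, OUT_SIZE);
	return 0;                         /* queued 8 of 10 bytes */
}
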
57897 diff -ruNp linux-2.6.19/drivers/xen/core/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/core/Makefile
57898 --- linux-2.6.19/drivers/xen/core/Makefile      1970-01-01 00:00:00.000000000 +0000
57899 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/Makefile    2007-02-02 19:10:45.000000000 +0000
57900 @@ -0,0 +1,14 @@
57901 +#
57902 +# Makefile for the linux kernel.
57903 +#
57904 +
57905 +obj-y := evtchn.o gnttab.o features.o
57906 +
57907 +obj-$(CONFIG_PROC_FS)          += xen_proc.o
57908 +obj-$(CONFIG_SYSFS)            += hypervisor_sysfs.o
57909 +obj-$(CONFIG_HOTPLUG_CPU)      += cpu_hotplug.o
57910 +obj-$(CONFIG_XEN_SYSFS)                += xen_sysfs.o
57911 +obj-$(CONFIG_XEN_SKBUFF)       += skbuff.o
57912 +obj-$(CONFIG_XEN_REBOOT)       += reboot.o machine_reboot.o
57913 +obj-$(CONFIG_XEN_SMPBOOT)      += smpboot.o
57914 +obj-$(CONFIG_KEXEC)            += machine_kexec.o
57915 diff -ruNp linux-2.6.19/drivers/xen/core/cpu_hotplug.c linux-2.6.19-xen-3.0.4/drivers/xen/core/cpu_hotplug.c
57916 --- linux-2.6.19/drivers/xen/core/cpu_hotplug.c 1970-01-01 00:00:00.000000000 +0000
57917 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/cpu_hotplug.c       2007-02-02 19:10:45.000000000 +0000
57918 @@ -0,0 +1,188 @@
57919 +
57920 +#include <linux/init.h>
57921 +#include <linux/kernel.h>
57922 +#include <linux/sched.h>
57923 +#include <linux/notifier.h>
57924 +#include <linux/cpu.h>
57925 +#include <xen/cpu_hotplug.h>
57926 +#include <xen/xenbus.h>
57927 +
57928 +/*
57929 + * Set of CPUs that remote admin software will allow us to bring online.
57930 + * Notified to us via xenbus.
57931 + */
57932 +static cpumask_t xenbus_allowed_cpumask;
57933 +
57934 +/* Set of CPUs that local admin will allow us to bring online. */
57935 +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
57936 +
57937 +static int local_cpu_hotplug_request(void)
57938 +{
57939 +       /*
57940 +        * We assume a CPU hotplug request comes from local admin if it is made
57941 +        * via a userspace process (i.e., one with a real mm_struct).
57942 +        */
57943 +       return (current->mm != NULL);
57944 +}
57945 +
57946 +static void vcpu_hotplug(unsigned int cpu)
57947 +{
57948 +       int err;
57949 +       char dir[32], state[32];
57950 +
57951 +       if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
57952 +               return;
57953 +
57954 +       sprintf(dir, "cpu/%d", cpu);
57955 +       err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
57956 +       if (err != 1) {
57957 +               printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
57958 +               return;
57959 +       }
57960 +
57961 +       if (strcmp(state, "online") == 0) {
57962 +               cpu_set(cpu, xenbus_allowed_cpumask);
57963 +               (void)cpu_up(cpu);
57964 +       } else if (strcmp(state, "offline") == 0) {
57965 +               cpu_clear(cpu, xenbus_allowed_cpumask);
57966 +               (void)cpu_down(cpu);
57967 +       } else {
57968 +               printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
57969 +                      state, cpu);
57970 +       }
57971 +}
57972 +
57973 +static void handle_vcpu_hotplug_event(
57974 +       struct xenbus_watch *watch, const char **vec, unsigned int len)
57975 +{
57976 +       int cpu;
57977 +       char *cpustr;
57978 +       const char *node = vec[XS_WATCH_PATH];
57979 +
57980 +       if ((cpustr = strstr(node, "cpu/")) != NULL) {
57981 +               sscanf(cpustr, "cpu/%d", &cpu);
57982 +               vcpu_hotplug(cpu);
57983 +       }
57984 +}
57985 +
57986 +static int smpboot_cpu_notify(struct notifier_block *notifier,
57987 +                             unsigned long action, void *hcpu)
57988 +{
57989 +       int cpu = (long)hcpu;
57990 +
57991 +       /*
57992 +        * We do this in a callback notifier rather than __cpu_disable()
57993 +        * because local_cpu_hotplug_request() does not work in the latter
57994 +        * as it's always executed from within a stopmachine kthread.
57995 +        */
57996 +       if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
57997 +               cpu_clear(cpu, local_allowed_cpumask);
57998 +
57999 +       return NOTIFY_OK;
58000 +}
58001 +
58002 +static int setup_cpu_watcher(struct notifier_block *notifier,
58003 +                             unsigned long event, void *data)
58004 +{
58005 +       int i;
58006 +
58007 +       static struct xenbus_watch cpu_watch = {
58008 +               .node = "cpu",
58009 +               .callback = handle_vcpu_hotplug_event,
58010 +               .flags = XBWF_new_thread };
58011 +       (void)register_xenbus_watch(&cpu_watch);
58012 +
58013 +       if (!is_initial_xendomain()) {
58014 +               for_each_possible_cpu(i)
58015 +                       vcpu_hotplug(i);
58016 +               printk(KERN_INFO "Brought up %ld CPUs\n",
58017 +                      (long)num_online_cpus());
58018 +       }
58019 +
58020 +       return NOTIFY_DONE;
58021 +}
58022 +
58023 +static int __init setup_vcpu_hotplug_event(void)
58024 +{
58025 +       static struct notifier_block hotplug_cpu = {
58026 +               .notifier_call = smpboot_cpu_notify };
58027 +       static struct notifier_block xsn_cpu = {
58028 +               .notifier_call = setup_cpu_watcher };
58029 +
58030 +       if (!is_running_on_xen())
58031 +               return -ENODEV;
58032 +
58033 +       register_cpu_notifier(&hotplug_cpu);
58034 +       register_xenstore_notifier(&xsn_cpu);
58035 +
58036 +       return 0;
58037 +}
58038 +
58039 +arch_initcall(setup_vcpu_hotplug_event);
58040 +
58041 +int smp_suspend(void)
58042 +{
58043 +       int i, err;
58044 +
58045 +       lock_cpu_hotplug();
58046 +
58047 +       /*
58048 +        * Take all other CPUs offline. We hold the hotplug mutex to
58049 +        * avoid other processes bringing up CPUs under our feet.
58050 +        */
58051 +       while (num_online_cpus() > 1) {
58052 +               unlock_cpu_hotplug();
58053 +               for_each_online_cpu(i) {
58054 +                       if (i == 0)
58055 +                               continue;
58056 +                       err = cpu_down(i);
58057 +                       if (err) {
58058 +                               printk(KERN_CRIT "Failed to take all CPUs "
58059 +                                      "down: %d.\n", err);
58060 +                               for_each_possible_cpu(i)
58061 +                                       vcpu_hotplug(i);
58062 +                               return err;
58063 +                       }
58064 +               }
58065 +               lock_cpu_hotplug();
58066 +       }
58067 +
58068 +       return 0;
58069 +}
58070 +
58071 +void smp_resume(void)
58072 +{
58073 +       int cpu;
58074 +
58075 +       for_each_possible_cpu(cpu)
58076 +               cpu_initialize_context(cpu);
58077 +
58078 +       unlock_cpu_hotplug();
58079 +
58080 +       for_each_possible_cpu(cpu)
58081 +               vcpu_hotplug(cpu);
58082 +}
58083 +
58084 +int cpu_up_check(unsigned int cpu)
58085 +{
58086 +       int rc = 0;
58087 +
58088 +       if (local_cpu_hotplug_request()) {
58089 +               cpu_set(cpu, local_allowed_cpumask);
58090 +               if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
58091 +                       printk("%s: attempt to bring up CPU %u disallowed by "
58092 +                              "remote admin.\n", __FUNCTION__, cpu);
58093 +                       rc = -EBUSY;
58094 +               }
58095 +       } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
58096 +                  !cpu_isset(cpu, xenbus_allowed_cpumask)) {
58097 +               rc = -EBUSY;
58098 +       }
58099 +
58100 +       return rc;
58101 +}
58102 +
58103 +void init_xenbus_allowed_cpumask(void)
58104 +{
58105 +       xenbus_allowed_cpumask = cpu_present_map;
58106 +}
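
cpu_up_check() above combines two allow-masks: xenbus_allowed_cpumask, driven by the remote toolstack through vcpu_hotplug(), and local_allowed_cpumask, which a local request (detected by local_cpu_hotplug_request()) may re-enable before the xenbus mask is consulted. A toy bitmask model of that decision follows; it is a simplified sketch with made-up mask values, not code from the patch.

#include <stdio.h>

static unsigned long xenbus_allowed = 0x3;  /* remote admin: CPUs 0 and 1 */
static unsigned long local_allowed  = ~0UL; /* local admin: all CPUs */

static int cpu_up_check_demo(unsigned int cpu, int local_request)
{
	if (local_request) {
		local_allowed |= 1UL << cpu;      /* local admin re-allows */
		if (!(xenbus_allowed & (1UL << cpu)))
			return -1;                /* -EBUSY in the driver */
	} else if (!(local_allowed & (1UL << cpu)) ||
		   !(xenbus_allowed & (1UL << cpu))) {
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("cpu1, local request: %d\n", cpu_up_check_demo(1, 1)); /* 0 */
	printf("cpu2, local request: %d\n", cpu_up_check_demo(2, 1)); /* -1 */
	return 0;
}
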
58107 diff -ruNp linux-2.6.19/drivers/xen/core/evtchn.c linux-2.6.19-xen-3.0.4/drivers/xen/core/evtchn.c
58108 --- linux-2.6.19/drivers/xen/core/evtchn.c      1970-01-01 00:00:00.000000000 +0000
58109 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/evtchn.c    2007-02-02 19:10:45.000000000 +0000
58110 @@ -0,0 +1,861 @@
58111 +/******************************************************************************
58112 + * evtchn.c
58113 + * 
58114 + * Communication via Xen event channels.
58115 + * 
58116 + * Copyright (c) 2002-2005, K A Fraser
58117 + * 
58118 + * This program is free software; you can redistribute it and/or
58119 + * modify it under the terms of the GNU General Public License version 2
58120 + * as published by the Free Software Foundation; or, when distributed
58121 + * separately from the Linux kernel or incorporated into other
58122 + * software packages, subject to the following license:
58123 + * 
58124 + * Permission is hereby granted, free of charge, to any person obtaining a copy
58125 + * of this source file (the "Software"), to deal in the Software without
58126 + * restriction, including without limitation the rights to use, copy, modify,
58127 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58128 + * and to permit persons to whom the Software is furnished to do so, subject to
58129 + * the following conditions:
58130 + * 
58131 + * The above copyright notice and this permission notice shall be included in
58132 + * all copies or substantial portions of the Software.
58133 + * 
58134 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58135 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58136 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58137 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58138 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58139 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58140 + * IN THE SOFTWARE.
58141 + */
58142 +
58143 +#include <linux/module.h>
58144 +#include <linux/irq.h>
58145 +#include <linux/interrupt.h>
58146 +#include <linux/sched.h>
58147 +#include <linux/kernel_stat.h>
58148 +#include <linux/version.h>
58149 +#include <asm/atomic.h>
58150 +#include <asm/system.h>
58151 +#include <asm/ptrace.h>
58152 +#include <asm/synch_bitops.h>
58153 +#include <xen/evtchn.h>
58154 +#include <xen/interface/event_channel.h>
58155 +#include <xen/interface/physdev.h>
58156 +#include <asm/hypervisor.h>
58157 +#include <linux/mc146818rtc.h> /* RTC_IRQ */
58158 +
58159 +/*
58160 + * This lock protects updates to the following mapping and reference-count
58161 + * arrays. The lock does not need to be acquired to read the mapping tables.
58162 + */
58163 +static DEFINE_SPINLOCK(irq_mapping_update_lock);
58164 +
58165 +/* IRQ <-> event-channel mappings. */
58166 +static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
58167 +       [0 ...  NR_EVENT_CHANNELS-1] = -1 };
58168 +
58169 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
58170 +static u32 irq_info[NR_IRQS];
58171 +
58172 +/* Binding types. */
58173 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
58174 +
58175 +/* Constructor for packed IRQ information. */
58176 +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
58177 +{
58178 +       return ((type << 24) | (index << 16) | evtchn);
58179 +}
58180 +
58181 +/* Convenient shorthand for packed representation of an unbound IRQ. */
58182 +#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
58183 +
58184 +/*
58185 + * Accessors for packed IRQ information.
58186 + */
58187 +
58188 +static inline unsigned int evtchn_from_irq(int irq)
58189 +{
58190 +       return (u16)(irq_info[irq]);
58191 +}
58192 +
58193 +static inline unsigned int index_from_irq(int irq)
58194 +{
58195 +       return (u8)(irq_info[irq] >> 16);
58196 +}
58197 +
58198 +static inline unsigned int type_from_irq(int irq)
58199 +{
58200 +       return (u8)(irq_info[irq] >> 24);
58201 +}
58202 +
58203 +/* IRQ <-> VIRQ mapping. */
58204 +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
58205 +
58206 +/* IRQ <-> IPI mapping. */
58207 +#ifndef NR_IPIS
58208 +#define NR_IPIS 1
58209 +#endif
58210 +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
58211 +
58212 +/* Reference counts for bindings to IRQs. */
58213 +static int irq_bindcount[NR_IRQS];
58214 +
58215 +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
58216 +static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
58217 +
58218 +#ifdef CONFIG_SMP
58219 +
58220 +static u8 cpu_evtchn[NR_EVENT_CHANNELS];
58221 +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
58222 +
58223 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
58224 +                                          unsigned int idx)
58225 +{
58226 +       return (sh->evtchn_pending[idx] &
58227 +               cpu_evtchn_mask[cpu][idx] &
58228 +               ~sh->evtchn_mask[idx]);
58229 +}
58230 +
58231 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
58232 +{
58233 +       int irq = evtchn_to_irq[chn];
58234 +
58235 +       BUG_ON(irq == -1);
58236 +       set_native_irq_info(irq, cpumask_of_cpu(cpu));
58237 +
58238 +       clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
58239 +       set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
58240 +       cpu_evtchn[chn] = cpu;
58241 +}
58242 +
58243 +static void init_evtchn_cpu_bindings(void)
58244 +{
58245 +       int i;
58246 +
58247 +       /* By default all event channels notify CPU#0. */
58248 +       for (i = 0; i < NR_IRQS; i++)
58249 +               set_native_irq_info(i, cpumask_of_cpu(0));
58250 +
58251 +       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
58252 +       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
58253 +}
58254 +
58255 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
58256 +{
58257 +       return cpu_evtchn[evtchn];
58258 +}
58259 +
58260 +#else
58261 +
58262 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
58263 +                                          unsigned int idx)
58264 +{
58265 +       return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
58266 +}
58267 +
58268 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
58269 +{
58270 +}
58271 +
58272 +static void init_evtchn_cpu_bindings(void)
58273 +{
58274 +}
58275 +
58276 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
58277 +{
58278 +       return 0;
58279 +}
58280 +
58281 +#endif
58282 +
58283 +/* Upcall to generic IRQ layer. */
58284 +#ifdef CONFIG_X86
58285 +extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
58286 +void __init xen_init_IRQ(void);
58287 +void __init init_IRQ(void)
58288 +{
58289 +       irq_ctx_init(0);
58290 +       xen_init_IRQ();
58291 +}
58292 +#if defined (__i386__)
58293 +static inline void exit_idle(void) {}
58294 +#define IRQ_REG orig_eax
58295 +#elif defined (__x86_64__)
58296 +#include <asm/idle.h>
58297 +#define IRQ_REG orig_rax
58298 +#endif
58299 +#define do_IRQ(irq, regs) do {         \
58300 +       (regs)->IRQ_REG = ~(irq);       \
58301 +       do_IRQ((regs));                 \
58302 +} while (0)
58303 +#endif
58304 +
58305 +/* Xen will never allocate port zero for any purpose. */
58306 +#define VALID_EVTCHN(chn)      ((chn) != 0)
58307 +
58308 +/*
58309 + * Force a proper event-channel callback from Xen after clearing the
58310 + * callback mask. We do this in a very simple manner, by making a call
58311 + * down into Xen. The pending flag will be checked by Xen on return.
58312 + */
58313 +void force_evtchn_callback(void)
58314 +{
58315 +       (void)HYPERVISOR_xen_version(0, NULL);
58316 +}
58317 +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
58318 +EXPORT_SYMBOL(force_evtchn_callback);
58319 +
58320 +/* NB. Interrupts are disabled on entry. */
58321 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
58322 +{
58323 +       unsigned long  l1, l2;
58324 +       unsigned int   l1i, l2i, port;
58325 +       int            irq, cpu = smp_processor_id();
58326 +       shared_info_t *s = HYPERVISOR_shared_info;
58327 +       vcpu_info_t   *vcpu_info = &s->vcpu_info[cpu];
58328 +
58329 +       vcpu_info->evtchn_upcall_pending = 0;
58330 +
58331 +#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
58332 +       /* Clear master pending flag /before/ clearing selector flag. */
58333 +       rmb();
58334 +#endif
58335 +       l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
58336 +       while (l1 != 0) {
58337 +               l1i = __ffs(l1);
58338 +               l1 &= ~(1UL << l1i);
58339 +
58340 +               while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
58341 +                       l2i = __ffs(l2);
58342 +
58343 +                       port = (l1i * BITS_PER_LONG) + l2i;
58344 +                       if ((irq = evtchn_to_irq[port]) != -1)
58345 +                               do_IRQ(irq, regs);
58346 +                       else {
58347 +                               exit_idle();
58348 +                               evtchn_device_upcall(port);
58349 +                       }
58350 +               }
58351 +       }
58352 +}
58353 +
58354 +static int find_unbound_irq(void)
58355 +{
58356 +       static int warned;
58357 +       int dynirq, irq;
58358 +
58359 +       for (dynirq = 0; dynirq < NR_DYNIRQS; dynirq++) {
58360 +               irq = dynirq_to_irq(dynirq);
58361 +               if (irq_bindcount[irq] == 0)
58362 +                       return irq;
58363 +       }
58364 +
58365 +       if (!warned) {
58366 +               warned = 1;
58367 +               printk(KERN_WARNING "No available IRQ to bind to: "
58368 +                      "increase NR_DYNIRQS.\n");
58369 +       }
58370 +
58371 +       return -ENOSPC;
58372 +}
58373 +
58374 +static int bind_evtchn_to_irq(unsigned int evtchn)
58375 +{
58376 +       int irq;
58377 +
58378 +       spin_lock(&irq_mapping_update_lock);
58379 +
58380 +       if ((irq = evtchn_to_irq[evtchn]) == -1) {
58381 +               if ((irq = find_unbound_irq()) < 0)
58382 +                       goto out;
58383 +
58384 +               evtchn_to_irq[evtchn] = irq;
58385 +               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
58386 +       }
58387 +
58388 +       irq_bindcount[irq]++;
58389 +
58390 + out:
58391 +       spin_unlock(&irq_mapping_update_lock);
58392 +       return irq;
58393 +}
58394 +
58395 +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
58396 +{
58397 +       struct evtchn_bind_virq bind_virq;
58398 +       int evtchn, irq;
58399 +
58400 +       spin_lock(&irq_mapping_update_lock);
58401 +
58402 +       if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
58403 +               if ((irq = find_unbound_irq()) < 0)
58404 +                       goto out;
58405 +
58406 +               bind_virq.virq = virq;
58407 +               bind_virq.vcpu = cpu;
58408 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
58409 +                                               &bind_virq) != 0)
58410 +                       BUG();
58411 +               evtchn = bind_virq.port;
58412 +
58413 +               evtchn_to_irq[evtchn] = irq;
58414 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
58415 +
58416 +               per_cpu(virq_to_irq, cpu)[virq] = irq;
58417 +
58418 +               bind_evtchn_to_cpu(evtchn, cpu);
58419 +       }
58420 +
58421 +       irq_bindcount[irq]++;
58422 +
58423 + out:
58424 +       spin_unlock(&irq_mapping_update_lock);
58425 +       return irq;
58426 +}
58427 +
58428 +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
58429 +{
58430 +       struct evtchn_bind_ipi bind_ipi;
58431 +       int evtchn, irq;
58432 +
58433 +       spin_lock(&irq_mapping_update_lock);
58434 +
58435 +       if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
58436 +               if ((irq = find_unbound_irq()) < 0)
58437 +                       goto out;
58438 +
58439 +               bind_ipi.vcpu = cpu;
58440 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
58441 +                                               &bind_ipi) != 0)
58442 +                       BUG();
58443 +               evtchn = bind_ipi.port;
58444 +
58445 +               evtchn_to_irq[evtchn] = irq;
58446 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
58447 +
58448 +               per_cpu(ipi_to_irq, cpu)[ipi] = irq;
58449 +
58450 +               bind_evtchn_to_cpu(evtchn, cpu);
58451 +       }
58452 +
58453 +       irq_bindcount[irq]++;
58454 +
58455 + out:
58456 +       spin_unlock(&irq_mapping_update_lock);
58457 +       return irq;
58458 +}
58459 +
58460 +static void unbind_from_irq(unsigned int irq)
58461 +{
58462 +       struct evtchn_close close;
58463 +       int evtchn = evtchn_from_irq(irq);
58464 +
58465 +       spin_lock(&irq_mapping_update_lock);
58466 +
58467 +       if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
58468 +               close.port = evtchn;
58469 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
58470 +                       BUG();
58471 +
58472 +               switch (type_from_irq(irq)) {
58473 +               case IRQT_VIRQ:
58474 +                       per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
58475 +                               [index_from_irq(irq)] = -1;
58476 +                       break;
58477 +               case IRQT_IPI:
58478 +                       per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
58479 +                               [index_from_irq(irq)] = -1;
58480 +                       break;
58481 +               default:
58482 +                       break;
58483 +               }
58484 +
58485 +               /* Closed ports are implicitly re-bound to VCPU0. */
58486 +               bind_evtchn_to_cpu(evtchn, 0);
58487 +
58488 +               evtchn_to_irq[evtchn] = -1;
58489 +               irq_info[irq] = IRQ_UNBOUND;
58490 +       }
58491 +
58492 +       spin_unlock(&irq_mapping_update_lock);
58493 +}
58494 +
58495 +int bind_evtchn_to_irqhandler(
58496 +       unsigned int evtchn,
58497 +       irq_handler_t handler,
58498 +       unsigned long irqflags,
58499 +       const char *devname,
58500 +       void *dev_id)
58501 +{
58502 +       unsigned int irq;
58503 +       int retval;
58504 +
58505 +       irq = bind_evtchn_to_irq(evtchn);
58506 +       if (irq < 0)
58507 +               return irq;
58508 +
58509 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
58510 +       if (retval != 0) {
58511 +               unbind_from_irq(irq);
58512 +               return retval;
58513 +       }
58514 +
58515 +       return irq;
58516 +}
58517 +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
58518 +
58519 +int bind_virq_to_irqhandler(
58520 +       unsigned int virq,
58521 +       unsigned int cpu,
58522 +       irq_handler_t handler,
58523 +       unsigned long irqflags,
58524 +       const char *devname,
58525 +       void *dev_id)
58526 +{
58527 +       unsigned int irq;
58528 +       int retval;
58529 +
58530 +       irq = bind_virq_to_irq(virq, cpu);
58531 +       if (irq < 0)
58532 +               return irq;
58533 +
58534 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
58535 +       if (retval != 0) {
58536 +               unbind_from_irq(irq);
58537 +               return retval;
58538 +       }
58539 +
58540 +       return irq;
58541 +}
58542 +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
58543 +
58544 +int bind_ipi_to_irqhandler(
58545 +       unsigned int ipi,
58546 +       unsigned int cpu,
58547 +       irq_handler_t handler,
58548 +       unsigned long irqflags,
58549 +       const char *devname,
58550 +       void *dev_id)
58551 +{
58552 +       unsigned int irq;
58553 +       int retval;
58554 +
58555 +       irq = bind_ipi_to_irq(ipi, cpu);
58556 +       if (irq < 0)
58557 +               return irq;
58558 +
58559 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
58560 +       if (retval != 0) {
58561 +               unbind_from_irq(irq);
58562 +               return retval;
58563 +       }
58564 +
58565 +       return irq;
58566 +}
58567 +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
58568 +
58569 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
58570 +{
58571 +       free_irq(irq, dev_id);
58572 +       unbind_from_irq(irq);
58573 +}
58574 +EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
58575 +
58576 +/* Rebind an evtchn so that it gets delivered to a specific cpu */
58577 +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
58578 +{
58579 +       struct evtchn_bind_vcpu bind_vcpu;
58580 +       int evtchn = evtchn_from_irq(irq);
58581 +
58582 +       if (!VALID_EVTCHN(evtchn))
58583 +               return;
58584 +
58585 +       /* Send future instances of this interrupt to other vcpu. */
58586 +       bind_vcpu.port = evtchn;
58587 +       bind_vcpu.vcpu = tcpu;
58588 +
58589 +       /*
58590 +        * If this fails, it usually just indicates that we're dealing with a 
58591 +        * virq or IPI channel, which don't actually need to be rebound. Ignore
58592 +        * it, but don't do the xenlinux-level rebind in that case.
58593 +        */
58594 +       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
58595 +               bind_evtchn_to_cpu(evtchn, tcpu);
58596 +}
58597 +
58598 +
58599 +static void set_affinity_irq(unsigned irq, cpumask_t dest)
58600 +{
58601 +       unsigned tcpu = first_cpu(dest);
58602 +       rebind_irq_to_cpu(irq, tcpu);
58603 +}
58604 +
58605 +static int retrigger_vector(unsigned int irq)
58606 +{
58607 +       int evtchn = evtchn_from_irq(irq);
58608 +       shared_info_t *s = HYPERVISOR_shared_info;
58609 +       if (!VALID_EVTCHN(evtchn))
58610 +               return 1;
58611 +       BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
58612 +       synch_set_bit(evtchn, &s->evtchn_pending[0]);
58613 +       return 1;
58614 +}
58615 +
58616 +/*
58617 + * Interface to generic handling in irq.c
58618 + */
58619 +
58620 +static unsigned int startup_dynirq_vector(unsigned int irq)
58621 +{
58622 +       int evtchn = evtchn_from_irq(irq);
58623 +
58624 +       if (VALID_EVTCHN(evtchn))
58625 +               unmask_evtchn(evtchn);
58626 +       return 0;
58627 +}
58628 +
58629 +static void unmask_dynirq_vector(unsigned int irq)
58630 +{
58631 +       int evtchn = evtchn_from_irq(irq);
58632 +
58633 +       if (VALID_EVTCHN(evtchn))
58634 +               unmask_evtchn(evtchn);
58635 +}
58636 +
58637 +static void mask_dynirq_vector(unsigned int irq)
58638 +{
58639 +       int evtchn = evtchn_from_irq(irq);
58640 +
58641 +       if (VALID_EVTCHN(evtchn))
58642 +               mask_evtchn(evtchn);
58643 +}
58644 +
58645 +static void ack_dynirq_vector(unsigned int irq)
58646 +{
58647 +       int evtchn = evtchn_from_irq(irq);
58648 +
58649 +       move_native_irq(irq);
58650 +
58651 +       if (VALID_EVTCHN(evtchn)) {
58652 +               mask_evtchn(evtchn);
58653 +               clear_evtchn(evtchn);
58654 +       }
58655 +}
58656 +
58657 +static void ack_dynirq_quirk_vector(unsigned int irq)
58658 +{
58659 +       int evtchn = evtchn_from_irq(irq);
58660 +
58661 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
58662 +               unmask_evtchn(evtchn);
58663 +}
58664 +
58665 +static struct irq_chip dynirq_chip = {
58666 +       .name           = "Dynamic-irq",
58667 +       .startup        = startup_dynirq_vector,
58668 +       .mask           = mask_dynirq_vector,
58669 +       .unmask         = unmask_dynirq_vector,
58670 +       .ack            = ack_dynirq_vector,
58671 +       .eoi            = ack_dynirq_quirk_vector,
58672 +#ifdef CONFIG_SMP
58673 +       .set_affinity   = set_affinity_irq,
58674 +#endif
58675 +       .retrigger      = retrigger_vector,
58676 +};
58677 +
58678 +static inline void pirq_unmask_notify(int pirq)
58679 +{
58680 +       struct physdev_eoi eoi = { .irq = pirq };
58681 +       if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
58682 +               (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
58683 +}
58684 +
58685 +static inline void pirq_query_unmask(int pirq)
58686 +{
58687 +       struct physdev_irq_status_query irq_status;
58688 +       irq_status.irq = pirq;
58689 +       (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
58690 +       clear_bit(pirq, &pirq_needs_eoi[0]);
58691 +       if (irq_status.flags & XENIRQSTAT_needs_eoi)
58692 +               set_bit(pirq, &pirq_needs_eoi[0]);
58693 +}
58694 +
58695 +/*
58696 + * On startup, if there is no action associated with the IRQ then we are
58697 + * probing. In this case we should not share with others as it will confuse us.
58698 + */
58699 +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
58700 +
58701 +static unsigned int startup_pirq_vector(unsigned int irq)
58702 +{
58703 +       struct evtchn_bind_pirq bind_pirq;
58704 +       int evtchn = evtchn_from_irq(irq);
58705 +
58706 +       if (VALID_EVTCHN(evtchn))
58707 +               goto out;
58708 +
58709 +       bind_pirq.pirq  = irq;
58710 +       /* NB. We are happy to share unless we are probing. */
58711 +       bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
58712 +       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
58713 +               if (!probing_irq(irq))
58714 +                       printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
58715 +                              irq);
58716 +               return 0;
58717 +       }
58718 +       evtchn = bind_pirq.port;
58719 +
58720 +       pirq_query_unmask(irq_to_pirq(irq));
58721 +
58722 +       evtchn_to_irq[evtchn] = irq;
58723 +       bind_evtchn_to_cpu(evtchn, 0);
58724 +       irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
58725 +
58726 + out:
58727 +       unmask_evtchn(evtchn);
58728 +       pirq_unmask_notify(irq_to_pirq(irq));
58729 +
58730 +       return 0;
58731 +}
58732 +
58733 +static void unmask_pirq_vector(unsigned int irq)
58734 +{
58735 +       int evtchn = evtchn_from_irq(irq);
58736 +
58737 +       if (VALID_EVTCHN(evtchn)) {
58738 +               unmask_evtchn(evtchn);
58739 +               pirq_unmask_notify(irq_to_pirq(irq));
58740 +       }
58741 +}
58742 +
58743 +static void mask_pirq_vector(unsigned int irq)
58744 +{
58745 +       int evtchn = evtchn_from_irq(irq);
58746 +
58747 +       if (VALID_EVTCHN(evtchn))
58748 +               mask_evtchn(evtchn);
58749 +}
58750 +
58751 +static void ack_pirq_vector(unsigned int irq)
58752 +{
58753 +       int evtchn = evtchn_from_irq(irq);
58754 +
58755 +       move_native_irq(irq);
58756 +
58757 +       if (VALID_EVTCHN(evtchn)) {
58758 +               mask_evtchn(evtchn);
58759 +               clear_evtchn(evtchn);
58760 +       }
58761 +}
58762 +
58763 +static void ack_pirq_quirk_vector(unsigned int irq)
58764 +{
58765 +       int evtchn = evtchn_from_irq(irq);
58766 +
58767 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) {
58768 +               unmask_evtchn(evtchn);
58769 +               pirq_unmask_notify(irq_to_pirq(irq));
58770 +       }
58771 +}
58772 +
58773 +static struct  irq_chip pirq_chip = {
58774 +       .name           = "Phys-irq",
58775 +       .startup        = startup_pirq_vector,
58776 +       .mask           = mask_pirq_vector,
58777 +       .unmask         = unmask_pirq_vector,
58778 +       .ack            = ack_pirq_vector,
58779 +       .eoi            = ack_pirq_quirk_vector,
58780 +#ifdef CONFIG_SMP
58781 +       .set_affinity   = set_affinity_irq,
58782 +#endif
58783 +       .retrigger      = retrigger_vector,
58784 +};
58785 +
58786 +int irq_ignore_unhandled(unsigned int irq)
58787 +{
58788 +       struct physdev_irq_status_query irq_status = { .irq = irq };
58789 +
58790 +       if (!is_running_on_xen())
58791 +               return 0;
58792 +
58793 +       (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
58794 +       return !!(irq_status.flags & XENIRQSTAT_shared);
58795 +}
58796 +
58797 +void resend_irq_on_evtchn(unsigned int i)
58798 +{
58799 +       int evtchn = evtchn_from_irq(i);
58800 +       shared_info_t *s = HYPERVISOR_shared_info;
58801 +       if (!VALID_EVTCHN(evtchn))
58802 +               return;
58803 +       BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
58804 +       synch_set_bit(evtchn, &s->evtchn_pending[0]);
58805 +}
58806 +
58807 +void notify_remote_via_irq(int irq)
58808 +{
58809 +       int evtchn = evtchn_from_irq(irq);
58810 +
58811 +       if (VALID_EVTCHN(evtchn))
58812 +               notify_remote_via_evtchn(evtchn);
58813 +}
58814 +EXPORT_SYMBOL_GPL(notify_remote_via_irq);
58815 +
58816 +void mask_evtchn(int port)
58817 +{
58818 +       shared_info_t *s = HYPERVISOR_shared_info;
58819 +       synch_set_bit(port, &s->evtchn_mask[0]);
58820 +}
58821 +EXPORT_SYMBOL_GPL(mask_evtchn);
58822 +
58823 +void unmask_evtchn(int port)
58824 +{
58825 +       shared_info_t *s = HYPERVISOR_shared_info;
58826 +       unsigned int cpu = smp_processor_id();
58827 +       vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
58828 +
58829 +       BUG_ON(!irqs_disabled());
58830 +
58831 +       /* Slow path (hypercall) if this is a non-local port. */
58832 +       if (unlikely(cpu != cpu_from_evtchn(port))) {
58833 +               struct evtchn_unmask unmask = { .port = port };
58834 +               (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
58835 +               return;
58836 +       }
58837 +
58838 +       synch_clear_bit(port, &s->evtchn_mask[0]);
58839 +
58840 +       /*
58841 +        * The following is basically the equivalent of 'hw_resend_irq'. Just
58842 +        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
58843 +        * masked.
58844 +        */
58845 +       if (synch_test_bit(port, &s->evtchn_pending[0]) &&
58846 +           !synch_test_and_set_bit(port / BITS_PER_LONG,
58847 +                                   &vcpu_info->evtchn_pending_sel))
58848 +               vcpu_info->evtchn_upcall_pending = 1;
58849 +}
58850 +EXPORT_SYMBOL_GPL(unmask_evtchn);
58851 +
58852 +void irq_resume(void)
58853 +{
58854 +       struct evtchn_bind_virq bind_virq;
58855 +       struct evtchn_bind_ipi  bind_ipi;
58856 +       int cpu, pirq, virq, ipi, irq, evtchn;
58857 +
58858 +       init_evtchn_cpu_bindings();
58859 +
58860 +       /* New event-channel space is not 'live' yet. */
58861 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
58862 +               mask_evtchn(evtchn);
58863 +
58864 +       /* Check that no PIRQs are still bound. */
58865 +       for (pirq = 0; pirq < NR_PIRQS; pirq++)
58866 +               BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
58867 +
58868 +       /* Secondary CPUs must have no VIRQ or IPI bindings. */
58869 +       for_each_possible_cpu(cpu) {
58870 +               if (cpu == 0)
58871 +                       continue;
58872 +               for (virq = 0; virq < NR_VIRQS; virq++)
58873 +                       BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
58874 +               for (ipi = 0; ipi < NR_IPIS; ipi++)
58875 +                       BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
58876 +       }
58877 +
58878 +       /* No IRQ <-> event-channel mappings. */
58879 +       for (irq = 0; irq < NR_IRQS; irq++)
58880 +               irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
58881 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
58882 +               evtchn_to_irq[evtchn] = -1;
58883 +
58884 +       /* Primary CPU: rebind VIRQs automatically. */
58885 +       for (virq = 0; virq < NR_VIRQS; virq++) {
58886 +               if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
58887 +                       continue;
58888 +
58889 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
58890 +
58891 +               /* Get a new binding from Xen. */
58892 +               bind_virq.virq = virq;
58893 +               bind_virq.vcpu = 0;
58894 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
58895 +                                               &bind_virq) != 0)
58896 +                       BUG();
58897 +               evtchn = bind_virq.port;
58898 +
58899 +               /* Record the new mapping. */
58900 +               evtchn_to_irq[evtchn] = irq;
58901 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
58902 +
58903 +               /* Ready for use. */
58904 +               unmask_evtchn(evtchn);
58905 +       }
58906 +
58907 +       /* Primary CPU: rebind IPIs automatically. */
58908 +       for (ipi = 0; ipi < NR_IPIS; ipi++) {
58909 +               if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
58910 +                       continue;
58911 +
58912 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
58913 +
58914 +               /* Get a new binding from Xen. */
58915 +               bind_ipi.vcpu = 0;
58916 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
58917 +                                               &bind_ipi) != 0)
58918 +                       BUG();
58919 +               evtchn = bind_ipi.port;
58920 +
58921 +               /* Record the new mapping. */
58922 +               evtchn_to_irq[evtchn] = irq;
58923 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
58924 +
58925 +               /* Ready for use. */
58926 +               unmask_evtchn(evtchn);
58927 +       }
58928 +}
58929 +
58930 +void __init xen_init_IRQ(void)
58931 +{
58932 +       int i;
58933 +
58934 +       init_evtchn_cpu_bindings();
58935 +
58936 +       /* No event channels are 'live' right now. */
58937 +       for (i = 0; i < NR_EVENT_CHANNELS; i++)
58938 +               mask_evtchn(i);
58939 +
58940 +       /* No IRQ -> event-channel mappings. */
58941 +       for (i = 0; i < NR_IRQS; i++)
58942 +               irq_info[i] = IRQ_UNBOUND;
58943 +
58944 +       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
58945 +       for (i = 0; i < NR_DYNIRQS; i++) {
58946 +               irq_bindcount[dynirq_to_irq(i)] = 0;
58947 +
58948 +               irq_desc[dynirq_to_irq(i)].status  = IRQ_DISABLED;
58949 +               irq_desc[dynirq_to_irq(i)].action  = NULL;
58950 +               irq_desc[dynirq_to_irq(i)].depth   = 1;
58951 +               set_irq_chip_and_handler_name(dynirq_to_irq(i), &dynirq_chip,
58952 +                                             handle_level_irq, "level");
58953 +       }
58954 +
58955 +       /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
58956 +       for (i = 0; i < NR_PIRQS; i++) {
58957 +               irq_bindcount[pirq_to_irq(i)] = 1;
58958 +
58959 +#ifdef RTC_IRQ
58960 +               /* If not domain 0, force our RTC driver to fail its probe. */
58961 +               if ((i == RTC_IRQ) && !is_initial_xendomain())
58962 +                       continue;
58963 +#endif
58964 +
58965 +               irq_desc[pirq_to_irq(i)].status  = IRQ_DISABLED;
58966 +               irq_desc[pirq_to_irq(i)].action  = NULL;
58967 +               irq_desc[pirq_to_irq(i)].depth   = 1;
58968 +               set_irq_chip_and_handler_name(pirq_to_irq(i), &pirq_chip,
58969 +                                             handle_level_irq, "level");
58970 +       }
58971 +}
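
evtchn_do_upcall() above walks a two-level structure: evtchn_pending_sel is a selector word whose set bits name which words of the pending bitmap need scanning, and __ffs() pulls out each set bit so the port number is recovered as l1i * BITS_PER_LONG + l2i. The standalone sketch below reproduces that scan with __builtin_ctzl standing in for the kernel's __ffs; the selector, pending words, and port values are made up for illustration.

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

int main(void)
{
	unsigned long pending[2] = { (1UL << 3) | (1UL << 7), 1UL << 0 };
	unsigned long sel = (1UL << 0) | (1UL << 1); /* words 0 and 1 are live */

	while (sel) {
		int w = __builtin_ctzl(sel);         /* role of __ffs(l1) */
		sel &= ~(1UL << w);
		while (pending[w]) {
			int b = __builtin_ctzl(pending[w]);
			pending[w] &= ~(1UL << b);
			printf("event on port %d\n", w * BITS_PER_LONG + b);
		}
	}
	return 0;                                    /* ports 3, 7, BITS_PER_LONG */
}
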
58972 diff -ruNp linux-2.6.19/drivers/xen/core/features.c linux-2.6.19-xen-3.0.4/drivers/xen/core/features.c
58973 --- linux-2.6.19/drivers/xen/core/features.c    1970-01-01 00:00:00.000000000 +0000
58974 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/features.c  2007-02-02 19:10:45.000000000 +0000
58975 @@ -0,0 +1,34 @@
58976 +/******************************************************************************
58977 + * features.c
58978 + *
58979 + * Xen feature flags.
58980 + *
58981 + * Copyright (c) 2006, Ian Campbell, XenSource Inc.
58982 + */
58983 +#include <linux/types.h>
58984 +#include <linux/cache.h>
58985 +#include <linux/module.h>
58986 +#include <asm/hypervisor.h>
58987 +#include <xen/features.h>
58988 +
58989 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
58990 +#include <xen/platform-compat.h>
58991 +#endif
58992 +
58993 +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
58994 +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
58995 +EXPORT_SYMBOL(xen_features);
58996 +
58997 +void setup_xen_features(void)
58998 +{
58999 +       xen_feature_info_t fi;
59000 +       int i, j;
59001 +
59002 +       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
59003 +               fi.submap_idx = i;
59004 +               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
59005 +                       break;
59006 +               for (j=0; j<32; j++)
59007 +                       xen_features[i*32+j] = !!(fi.submap & 1<<j);
59008 +       }
59009 +}
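
setup_xen_features() above expands each 32-bit submap returned by XENVER_get_features into one byte per flag in xen_features[], so later feature tests become plain array lookups. The unpacking step in isolation looks like the sketch below, with a made-up submap value standing in for the hypervisor's reply.

#include <stdio.h>

int main(void)
{
	unsigned char features[32];
	unsigned int submap = (1u << 0) | (1u << 4); /* pretend reply */
	int j;

	for (j = 0; j < 32; j++)
		features[j] = !!(submap & (1u << j));

	printf("flag 0: %d, flag 3: %d, flag 4: %d\n",
	       features[0], features[3], features[4]); /* 1, 0, 1 */
	return 0;
}
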
59010 diff -ruNp linux-2.6.19/drivers/xen/core/gnttab.c linux-2.6.19-xen-3.0.4/drivers/xen/core/gnttab.c
59011 --- linux-2.6.19/drivers/xen/core/gnttab.c      1970-01-01 00:00:00.000000000 +0000
59012 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/gnttab.c    2007-02-02 19:10:45.000000000 +0000
59013 @@ -0,0 +1,487 @@
59014 +/******************************************************************************
59015 + * gnttab.c
59016 + *
59017 + * Granting foreign access to our memory reservation.
59018 + *
59019 + * Copyright (c) 2005, Christopher Clark
59020 + * Copyright (c) 2004-2005, K A Fraser
59021 + *
59022 + * This program is free software; you can redistribute it and/or
59023 + * modify it under the terms of the GNU General Public License version 2
59024 + * as published by the Free Software Foundation; or, when distributed
59025 + * separately from the Linux kernel or incorporated into other
59026 + * software packages, subject to the following license:
59027 + *
59028 + * Permission is hereby granted, free of charge, to any person obtaining a copy
59029 + * of this source file (the "Software"), to deal in the Software without
59030 + * restriction, including without limitation the rights to use, copy, modify,
59031 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
59032 + * and to permit persons to whom the Software is furnished to do so, subject to
59033 + * the following conditions:
59034 + *
59035 + * The above copyright notice and this permission notice shall be included in
59036 + * all copies or substantial portions of the Software.
59037 + *
59038 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59039 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59040 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
59041 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59042 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
59043 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
59044 + * IN THE SOFTWARE.
59045 + */
59046 +
59047 +#include <linux/module.h>
59048 +#include <linux/sched.h>
59049 +#include <linux/mm.h>
59050 +#include <linux/vmalloc.h>
59051 +#include <xen/interface/xen.h>
59052 +#include <xen/gnttab.h>
59053 +#include <asm/pgtable.h>
59054 +#include <asm/uaccess.h>
59055 +#include <asm/synch_bitops.h>
59056 +#include <asm/io.h>
59057 +#include <xen/interface/memory.h>
59058 +
59059 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
59060 +#include <xen/platform-compat.h>
59061 +#endif
59062 +
59063 +/* External tools reserve first few grant table entries. */
59064 +#define NR_RESERVED_ENTRIES 8
59065 +
59066 +#define NR_GRANT_ENTRIES \
59067 +       (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(struct grant_entry))
59068 +#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
59069 +
59070 +static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
59071 +static int gnttab_free_count;
59072 +static grant_ref_t gnttab_free_head;
59073 +static DEFINE_SPINLOCK(gnttab_list_lock);
59074 +
59075 +static struct grant_entry *shared;
59076 +
59077 +static struct gnttab_free_callback *gnttab_free_callback_list;
59078 +
59079 +static int get_free_entries(int count)
59080 +{
59081 +       unsigned long flags;
59082 +       int ref;
59083 +       grant_ref_t head;
59084 +       spin_lock_irqsave(&gnttab_list_lock, flags);
59085 +       if (gnttab_free_count < count) {
59086 +               spin_unlock_irqrestore(&gnttab_list_lock, flags);
59087 +               return -1;
59088 +       }
59089 +       ref = head = gnttab_free_head;
59090 +       gnttab_free_count -= count;
59091 +       while (count-- > 1)
59092 +               head = gnttab_list[head];
59093 +       gnttab_free_head = gnttab_list[head];
59094 +       gnttab_list[head] = GNTTAB_LIST_END;
59095 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
59096 +       return ref;
59097 +}
59098 +
59099 +#define get_free_entry() get_free_entries(1)
59100 +
59101 +static void do_free_callbacks(void)
59102 +{
59103 +       struct gnttab_free_callback *callback, *next;
59104 +
59105 +       callback = gnttab_free_callback_list;
59106 +       gnttab_free_callback_list = NULL;
59107 +
59108 +       while (callback != NULL) {
59109 +               next = callback->next;
59110 +               if (gnttab_free_count >= callback->count) {
59111 +                       callback->next = NULL;
59112 +                       callback->fn(callback->arg);
59113 +               } else {
59114 +                       callback->next = gnttab_free_callback_list;
59115 +                       gnttab_free_callback_list = callback;
59116 +               }
59117 +               callback = next;
59118 +       }
59119 +}
59120 +
59121 +static inline void check_free_callbacks(void)
59122 +{
59123 +       if (unlikely(gnttab_free_callback_list))
59124 +               do_free_callbacks();
59125 +}
59126 +
59127 +static void put_free_entry(grant_ref_t ref)
59128 +{
59129 +       unsigned long flags;
59130 +       spin_lock_irqsave(&gnttab_list_lock, flags);
59131 +       gnttab_list[ref] = gnttab_free_head;
59132 +       gnttab_free_head = ref;
59133 +       gnttab_free_count++;
59134 +       check_free_callbacks();
59135 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
59136 +}
59137 +
59138 +/*
59139 + * Public grant-issuing interface functions
59140 + */
59141 +
59142 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
59143 +                               int readonly)
59144 +{
59145 +       int ref;
59146 +
59147 +       if (unlikely((ref = get_free_entry()) == -1))
59148 +               return -ENOSPC;
59149 +
59150 +       shared[ref].frame = frame;
59151 +       shared[ref].domid = domid;
59152 +       wmb();
59153 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
59154 +
59155 +       return ref;
59156 +}
59157 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
59158 +
59159 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
59160 +                                    unsigned long frame, int readonly)
59161 +{
59162 +       shared[ref].frame = frame;
59163 +       shared[ref].domid = domid;
59164 +       wmb();
59165 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
59166 +}
59167 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
59168 +
59169 +
59170 +int gnttab_query_foreign_access(grant_ref_t ref)
59171 +{
59172 +       u16 nflags;
59173 +
59174 +       nflags = shared[ref].flags;
59175 +
59176 +       return (nflags & (GTF_reading|GTF_writing));
59177 +}
59178 +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
59179 +
59180 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
59181 +{
59182 +       u16 flags, nflags;
59183 +
59184 +       nflags = shared[ref].flags;
59185 +       do {
59186 +               if ((flags = nflags) & (GTF_reading|GTF_writing)) {
59187 +                       printk(KERN_ALERT "WARNING: g.e. still in use!\n");
59188 +                       return 0;
59189 +               }
59190 +       } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) !=
59191 +                flags);
59192 +
59193 +       return 1;
59194 +}
59195 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
59196 +
59197 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
59198 +                              unsigned long page)
59199 +{
59200 +       if (gnttab_end_foreign_access_ref(ref, readonly)) {
59201 +               put_free_entry(ref);
59202 +               if (page != 0)
59203 +                       free_page(page);
59204 +       } else {
59205 +               /* XXX This needs to be fixed so that the ref and page are
59206 +                  placed on a list to be freed up later. */
59207 +               printk(KERN_WARNING
59208 +                      "WARNING: leaking g.e. and page still in use!\n");
59209 +       }
59210 +}
59211 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
59212 +
59213 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
59214 +{
59215 +       int ref;
59216 +
59217 +       if (unlikely((ref = get_free_entry()) == -1))
59218 +               return -ENOSPC;
59219 +       gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
59220 +
59221 +       return ref;
59222 +}
59223 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
59224 +
59225 +void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
59226 +                                      unsigned long pfn)
59227 +{
59228 +       shared[ref].frame = pfn;
59229 +       shared[ref].domid = domid;
59230 +       wmb();
59231 +       shared[ref].flags = GTF_accept_transfer;
59232 +}
59233 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
59234 +
59235 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
59236 +{
59237 +       unsigned long frame;
59238 +       u16           flags;
59239 +
59240 +       /*
59241 +        * If a transfer has not yet started, try to reclaim the grant
59242 +        * reference and return failure (== 0).
59243 +        */
59244 +       while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
59245 +               if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags)
59246 +                       return 0;
59247 +               cpu_relax();
59248 +       }
59249 +
59250 +       /* If a transfer is in progress then wait until it is completed. */
59251 +       while (!(flags & GTF_transfer_completed)) {
59252 +               flags = shared[ref].flags;
59253 +               cpu_relax();
59254 +       }
59255 +
59256 +       /* Read the frame number /after/ reading completion status. */
59257 +       rmb();
59258 +       frame = shared[ref].frame;
59259 +       BUG_ON(frame == 0);
59260 +
59261 +       return frame;
59262 +}
59263 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
59264 +
59265 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
59266 +{
59267 +       unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
59268 +       put_free_entry(ref);
59269 +       return frame;
59270 +}
59271 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
59272 +
59273 +void gnttab_free_grant_reference(grant_ref_t ref)
59274 +{
59275 +       put_free_entry(ref);
59276 +}
59277 +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
59278 +
59279 +void gnttab_free_grant_references(grant_ref_t head)
59280 +{
59281 +       grant_ref_t ref;
59282 +       unsigned long flags;
59283 +       int count = 1;
59284 +       if (head == GNTTAB_LIST_END)
59285 +               return;
59286 +       spin_lock_irqsave(&gnttab_list_lock, flags);
59287 +       ref = head;
59288 +       while (gnttab_list[ref] != GNTTAB_LIST_END) {
59289 +               ref = gnttab_list[ref];
59290 +               count++;
59291 +       }
59292 +       gnttab_list[ref] = gnttab_free_head;
59293 +       gnttab_free_head = head;
59294 +       gnttab_free_count += count;
59295 +       check_free_callbacks();
59296 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
59297 +}
59298 +EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
59299 +
59300 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
59301 +{
59302 +       int h = get_free_entries(count);
59303 +
59304 +       if (h == -1)
59305 +               return -ENOSPC;
59306 +
59307 +       *head = h;
59308 +
59309 +       return 0;
59310 +}
59311 +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
59312 +
59313 +int gnttab_empty_grant_references(const grant_ref_t *private_head)
59314 +{
59315 +       return (*private_head == GNTTAB_LIST_END);
59316 +}
59317 +EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
59318 +
59319 +int gnttab_claim_grant_reference(grant_ref_t *private_head)
59320 +{
59321 +       grant_ref_t g = *private_head;
59322 +       if (unlikely(g == GNTTAB_LIST_END))
59323 +               return -ENOSPC;
59324 +       *private_head = gnttab_list[g];
59325 +       return g;
59326 +}
59327 +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
59328 +
59329 +void gnttab_release_grant_reference(grant_ref_t *private_head,
59330 +                                   grant_ref_t release)
59331 +{
59332 +       gnttab_list[release] = *private_head;
59333 +       *private_head = release;
59334 +}
59335 +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
59336 +
59337 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
59338 +                                 void (*fn)(void *), void *arg, u16 count)
59339 +{
59340 +       unsigned long flags;
59341 +       spin_lock_irqsave(&gnttab_list_lock, flags);
59342 +       if (callback->next)
59343 +               goto out;
59344 +       callback->fn = fn;
59345 +       callback->arg = arg;
59346 +       callback->count = count;
59347 +       callback->next = gnttab_free_callback_list;
59348 +       gnttab_free_callback_list = callback;
59349 +       check_free_callbacks();
59350 +out:
59351 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
59352 +}
59353 +EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
59354 +
59355 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
59356 +{
59357 +       struct gnttab_free_callback **pcb;
59358 +       unsigned long flags;
59359 +
59360 +       spin_lock_irqsave(&gnttab_list_lock, flags);
59361 +       for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
59362 +               if (*pcb == callback) {
59363 +                       *pcb = callback->next;
59364 +                       break;
59365 +               }
59366 +       }
59367 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
59368 +}
59369 +EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
59370 +
59371 +#ifdef CONFIG_XEN
59372 +
59373 +#ifndef __ia64__
59374 +static int map_pte_fn(pte_t *pte, struct page *pmd_page,
59375 +                     unsigned long addr, void *data)
59376 +{
59377 +       unsigned long **frames = (unsigned long **)data;
59378 +
59379 +       set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
59380 +       (*frames)++;
59381 +       return 0;
59382 +}
59383 +
59384 +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
59385 +                       unsigned long addr, void *data)
59386 +{
59387 +
59388 +       set_pte_at(&init_mm, addr, pte, __pte(0));
59389 +       return 0;
59390 +}
59391 +#endif
59392 +
59393 +int gnttab_resume(void)
59394 +{
59395 +       struct gnttab_setup_table setup;
59396 +       unsigned long frames[NR_GRANT_FRAMES];
59397 +       int rc;
59398 +#ifndef __ia64__
59399 +       void *pframes = frames;
59400 +       struct vm_struct *area;
59401 +#endif
59402 +
59403 +       setup.dom        = DOMID_SELF;
59404 +       setup.nr_frames  = NR_GRANT_FRAMES;
59405 +       set_xen_guest_handle(setup.frame_list, frames);
59406 +
59407 +       rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
59408 +       if (rc == -ENOSYS)
59409 +               return -ENOSYS;
59410 +
59411 +       BUG_ON(rc || setup.status);
59412 +
59413 +#ifndef __ia64__
59414 +       if (shared == NULL) {
59415 +               area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
59416 +               BUG_ON(area == NULL);
59417 +               shared = area->addr;
59418 +       }
59419 +       rc = apply_to_page_range(&init_mm, (unsigned long)shared,
59420 +                                PAGE_SIZE * NR_GRANT_FRAMES,
59421 +                                map_pte_fn, &pframes);
59422 +       BUG_ON(rc);
59423 +#else
59424 +       shared = __va(frames[0] << PAGE_SHIFT);
59425 +       printk("grant table at %p\n", shared);
59426 +#endif
59427 +
59428 +       return 0;
59429 +}
59430 +
59431 +int gnttab_suspend(void)
59432 +{
59433 +#ifndef __ia64__
59434 +       apply_to_page_range(&init_mm, (unsigned long)shared,
59435 +                           PAGE_SIZE * NR_GRANT_FRAMES,
59436 +                           unmap_pte_fn, NULL);
59437 +#endif
59438 +       return 0;
59439 +}
59440 +
59441 +#else /* !CONFIG_XEN */
59442 +
59443 +#include <platform-pci.h>
59444 +
59445 +int gnttab_resume(void)
59446 +{
59447 +       unsigned long frames;
59448 +       struct xen_add_to_physmap xatp;
59449 +       unsigned int i;
59450 +
59451 +       frames = alloc_xen_mmio(PAGE_SIZE * NR_GRANT_FRAMES);
59452 +
59453 +       for (i = 0; i < NR_GRANT_FRAMES; i++) {
59454 +               xatp.domid = DOMID_SELF;
59455 +               xatp.idx = i;
59456 +               xatp.space = XENMAPSPACE_grant_table;
59457 +               xatp.gpfn = (frames >> PAGE_SHIFT) + i;
59458 +               if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
59459 +                       BUG();
59460 +       }
59461 +
59462 +       shared = ioremap(frames, PAGE_SIZE * NR_GRANT_FRAMES);
59463 +       if (shared == NULL) {
59464 +               printk("failed to ioremap grant table shared frames\n");
59465 +               return -1;
59466 +       }
59467 +
59468 +       return 0;
59469 +}
59470 +
59471 +int gnttab_suspend(void)
59472 +{
59473 +       iounmap(shared);
59474 +       return 0;
59475 +}
59476 +
59477 +#endif /* !CONFIG_XEN */
59478 +
59479 +int __init gnttab_init(void)
59480 +{
59481 +       int i;
59482 +
59483 +       if (!is_running_on_xen())
59484 +               return -ENODEV;
59485 +
59486 +       if (gnttab_resume() < 0)
59487 +               return -ENODEV;
59488 +
59489 +       for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
59490 +               gnttab_list[i] = i + 1;
59491 +       gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
59492 +       gnttab_free_head  = NR_RESERVED_ENTRIES;
59493 +
59494 +       printk("Grant table initialized\n");
59495 +       return 0;
59496 +}
59497 +
59498 +#ifdef CONFIG_XEN
59499 +core_initcall(gnttab_init);
59500 +#endif
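For orientation, a minimal usage sketch of the grant-table API added above, as a split driver might use it to share one page with dom0. This is illustrative only; it assumes the gnttab_* prototypes from <xen/gnttab.h> shown in this file and the virt_to_mfn() helper from the Xen headers, and abbreviates error handling.

    #include <linux/gfp.h>
    #include <xen/gnttab.h>

    static int share_page_with_dom0(void)
    {
            unsigned long page = __get_free_page(GFP_KERNEL);
            int ref;

            if (!page)
                    return -ENOMEM;

            /* Grant dom0 (domid 0) read/write access to this frame. */
            ref = gnttab_grant_foreign_access(0, virt_to_mfn(page), 0);
            if (ref < 0) {
                    free_page(page);
                    return ref;
            }

            /* ... advertise 'ref' to the peer (e.g. via xenstore), do I/O ... */

            /* Revoke the grant and free the page; gnttab_end_foreign_access()
             * warns and leaks instead if the peer still holds the grant. */
            gnttab_end_foreign_access(ref, 0, page);
            return 0;
    }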
59501 diff -ruNp linux-2.6.19/drivers/xen/core/hypervisor_sysfs.c linux-2.6.19-xen-3.0.4/drivers/xen/core/hypervisor_sysfs.c
59502 --- linux-2.6.19/drivers/xen/core/hypervisor_sysfs.c    1970-01-01 00:00:00.000000000 +0000
59503 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/hypervisor_sysfs.c  2007-02-02 19:10:45.000000000 +0000
59504 @@ -0,0 +1,59 @@
59505 +/*
59506 + *  copyright (c) 2006 IBM Corporation
59507 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
59508 + *
59509 + *  This program is free software; you can redistribute it and/or modify
59510 + *  it under the terms of the GNU General Public License version 2 as
59511 + *  published by the Free Software Foundation.
59512 + */
59513 +
59514 +#include <linux/kernel.h>
59515 +#include <linux/module.h>
59516 +#include <linux/kobject.h>
59517 +#include <xen/hypervisor_sysfs.h>
59518 +
59519 +decl_subsys(hypervisor, NULL, NULL);
59520 +
59521 +static ssize_t hyp_sysfs_show(struct kobject *kobj,
59522 +                             struct attribute *attr,
59523 +                             char *buffer)
59524 +{
59525 +       struct hyp_sysfs_attr *hyp_attr;
59526 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
59527 +       if (hyp_attr->show)
59528 +               return hyp_attr->show(hyp_attr, buffer);
59529 +       return 0;
59530 +}
59531 +
59532 +static ssize_t hyp_sysfs_store(struct kobject *kobj,
59533 +                              struct attribute *attr,
59534 +                              const char *buffer,
59535 +                              size_t len)
59536 +{
59537 +       struct hyp_sysfs_attr *hyp_attr;
59538 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
59539 +       if (hyp_attr->store)
59540 +               return hyp_attr->store(hyp_attr, buffer, len);
59541 +       return 0;
59542 +}
59543 +
59544 +struct sysfs_ops hyp_sysfs_ops = {
59545 +       .show = hyp_sysfs_show,
59546 +       .store = hyp_sysfs_store,
59547 +};
59548 +
59549 +static struct kobj_type hyp_sysfs_kobj_type = {
59550 +       .sysfs_ops = &hyp_sysfs_ops,
59551 +};
59552 +
59553 +static int __init hypervisor_subsys_init(void)
59554 +{
59555 +       if (!is_running_on_xen())
59556 +               return -ENODEV;
59557 +
59558 +       hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
59559 +       return subsystem_register(&hypervisor_subsys);
59560 +}
59561 +
59562 +device_initcall(hypervisor_subsys_init);
59563 +EXPORT_SYMBOL_GPL(hypervisor_subsys);
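A read-only attribute built on the hyp_sysfs_show()/hyp_sysfs_store() dispatchers above might look like the following sketch. It is illustrative only and assumes struct hyp_sysfs_attr in <xen/hypervisor_sysfs.h> carries .attr, .show and .store members, as the container_of() calls imply.

    static ssize_t flavour_show(struct hyp_sysfs_attr *attr, char *buffer)
    {
            return sprintf(buffer, "xen\n");
    }

    static struct hyp_sysfs_attr flavour_attr = {
            .attr = { .name = "flavour", .mode = 0444 },
            .show = flavour_show,
    };

    /* Would appear as /sys/hypervisor/flavour, e.g. via:
     *   sysfs_create_file(&hypervisor_subsys.kset.kobj, &flavour_attr.attr);
     */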
59564 diff -ruNp linux-2.6.19/drivers/xen/core/machine_kexec.c linux-2.6.19-xen-3.0.4/drivers/xen/core/machine_kexec.c
59565 --- linux-2.6.19/drivers/xen/core/machine_kexec.c       1970-01-01 00:00:00.000000000 +0000
59566 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/machine_kexec.c     2007-02-02 19:10:45.000000000 +0000
59567 @@ -0,0 +1,190 @@
59568 +/*
59569 + * drivers/xen/core/machine_kexec.c 
59570 + * handle transition of Linux booting another kernel
59571 + */
59572 +
59573 +#include <linux/kexec.h>
59574 +#include <xen/interface/kexec.h>
59575 +#include <linux/mm.h>
59576 +#include <linux/bootmem.h>
59577 +#include <asm/hypercall.h>
59578 +
59579 +extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
59580 +                                        struct kimage *image);
59581 +
59582 +int xen_max_nr_phys_cpus;
59583 +struct resource xen_hypervisor_res;
59584 +struct resource *xen_phys_cpus;
59585 +
59586 +void xen_machine_kexec_setup_resources(void)
59587 +{
59588 +       xen_kexec_range_t range;
59589 +       struct resource *res;
59590 +       int k = 0;
59591 +
59592 +       if (!is_initial_xendomain())
59593 +               return;
59594 +
59595 +       /* determine maximum number of physical cpus */
59596 +
59597 +       while (1) {
59598 +               memset(&range, 0, sizeof(range));
59599 +               range.range = KEXEC_RANGE_MA_CPU;
59600 +               range.nr = k;
59601 +
59602 +               if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
59603 +                       break;
59604 +
59605 +               k++;
59606 +       }
59607 +
59608 +       if (k == 0)
59609 +               return;
59610 +
59611 +       xen_max_nr_phys_cpus = k;
59612 +
59613 +       /* allocate xen_phys_cpus */
59614 +
59615 +       xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
59616 +       BUG_ON(xen_phys_cpus == NULL);
59617 +
59618 +       /* fill in xen_phys_cpus with per-cpu crash note information */
59619 +
59620 +       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
59621 +               memset(&range, 0, sizeof(range));
59622 +               range.range = KEXEC_RANGE_MA_CPU;
59623 +               range.nr = k;
59624 +
59625 +               if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
59626 +                       goto err;
59627 +
59628 +               res = xen_phys_cpus + k;
59629 +
59630 +               memset(res, 0, sizeof(*res));
59631 +               res->name = "Crash note";
59632 +               res->start = range.start;
59633 +               res->end = range.start + range.size - 1;
59634 +               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
59635 +       }
59636 +
59637 +       /* fill in xen_hypervisor_res with hypervisor machine address range */
59638 +
59639 +       memset(&range, 0, sizeof(range));
59640 +       range.range = KEXEC_RANGE_MA_XEN;
59641 +
59642 +       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
59643 +               goto err;
59644 +
59645 +       xen_hypervisor_res.name = "Hypervisor code and data";
59646 +       xen_hypervisor_res.start = range.start;
59647 +       xen_hypervisor_res.end = range.start + range.size - 1;
59648 +       xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
59649 +
59650 +       /* fill in crashk_res if range is reserved by hypervisor */
59651 +
59652 +       memset(&range, 0, sizeof(range));
59653 +       range.range = KEXEC_RANGE_MA_CRASH;
59654 +
59655 +       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
59656 +               return;
59657 +
59658 +       if (range.size) {
59659 +               crashk_res.start = range.start;
59660 +               crashk_res.end = range.start + range.size - 1;
59661 +       }
59662 +
59663 +       return;
59664 +
59665 + err:
59666 +       /*
59667 +        * It isn't possible to free xen_phys_cpus this early in the
59668 +        * boot. Since failure at this stage is unexpected and the
59669 +        * amount is small, we leak the memory.
59670 +        */
59671 +       xen_max_nr_phys_cpus = 0;
59672 +       return;
59673 +}
59674 +
59675 +void xen_machine_kexec_register_resources(struct resource *res)
59676 +{
59677 +       int k;
59678 +
59679 +       request_resource(res, &xen_hypervisor_res);
59680 +
59681 +       for (k = 0; k < xen_max_nr_phys_cpus; k++)
59682 +               request_resource(&xen_hypervisor_res, xen_phys_cpus + k);
59683 +
59684 +}
59685 +
59686 +static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
59687 +{
59688 +       machine_kexec_setup_load_arg(xki, image);
59689 +
59690 +       xki->indirection_page = image->head;
59691 +       xki->start_address = image->start;
59692 +}
59693 +
59694 +/*
59695 + * Load the image into xen so xen can kdump itself
59696 + * This might have been done in prepare, but prepare
59697 + * is currently called too early. It might make sense
59698 + * to move prepare, but for now, just add an extra hook.
59699 + */
59700 +int xen_machine_kexec_load(struct kimage *image)
59701 +{
59702 +       xen_kexec_load_t xkl;
59703 +
59704 +       memset(&xkl, 0, sizeof(xkl));
59705 +       xkl.type = image->type;
59706 +       setup_load_arg(&xkl.image, image);
59707 +       return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
59708 +}
59709 +
59710 +/*
59711 + * Unload the image that was stored by machine_kexec_load()
59712 + * This might have been done in machine_kexec_cleanup() but it
59713 + * is called too late, and it's possible xen could try to kdump
59714 + * using resources that have been freed.
59715 + */
59716 +void xen_machine_kexec_unload(struct kimage *image)
59717 +{
59718 +       xen_kexec_load_t xkl;
59719 +
59720 +       memset(&xkl, 0, sizeof(xkl));
59721 +       xkl.type = image->type;
59722 +       HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
59723 +}
59724 +
59725 +/*
59726 + * Do not allocate memory (or fail in any way) in machine_kexec().
59727 + * We are past the point of no return, committed to rebooting now.
59728 + *
59729 + * This has the hypervisor move to the preferred reboot CPU,
59730 + * stop all CPUs and kexec. That is, it combines machine_shutdown()
59731 + * and machine_kexec() in Linux kexec terms.
59732 + */
59733 +NORET_TYPE void machine_kexec(struct kimage *image)
59734 +{
59735 +       xen_kexec_exec_t xke;
59736 +
59737 +       memset(&xke, 0, sizeof(xke));
59738 +       xke.type = image->type;
59739 +       HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
59740 +       panic("KEXEC_CMD_kexec hypercall should not return\n");
59741 +}
59742 +
59743 +void machine_shutdown(void)
59744 +{
59745 +       /* do nothing */
59746 +}
59747 +
59748 +
59749 +/*
59750 + * Local variables:
59751 + *  c-file-style: "linux"
59752 + *  indent-tabs-mode: t
59753 + *  c-indent-level: 8
59754 + *  c-basic-offset: 8
59755 + *  tab-width: 8
59756 + * End:
59757 + */
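Roughly, the two resource helpers above arrange the iomem tree as follows once the hypervisor has reported its ranges (illustrative layout, assuming two physical CPUs):

      <parent resource passed to xen_machine_kexec_register_resources()>
        Hypervisor code and data            (xen_hypervisor_res)
          Crash note                        (xen_phys_cpus[0])
          Crash note                        (xen_phys_cpus[1])

crashk_res is only filled in here, from the KEXEC_RANGE_MA_CRASH range when the hypervisor reserves one; it is registered into the tree separately by the arch setup code.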
59758 diff -ruNp linux-2.6.19/drivers/xen/core/machine_reboot.c linux-2.6.19-xen-3.0.4/drivers/xen/core/machine_reboot.c
59759 --- linux-2.6.19/drivers/xen/core/machine_reboot.c      1970-01-01 00:00:00.000000000 +0000
59760 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/machine_reboot.c    2007-02-02 19:10:45.000000000 +0000
59761 @@ -0,0 +1,185 @@
59762 +#define __KERNEL_SYSCALLS__
59763 +#include <linux/version.h>
59764 +#include <linux/kernel.h>
59765 +#include <linux/mm.h>
59766 +#include <linux/unistd.h>
59767 +#include <linux/module.h>
59768 +#include <linux/reboot.h>
59769 +#include <linux/sysrq.h>
59770 +#include <linux/stringify.h>
59771 +#include <asm/irq.h>
59772 +#include <asm/mmu_context.h>
59773 +#include <xen/evtchn.h>
59774 +#include <asm/hypervisor.h>
59775 +#include <xen/interface/dom0_ops.h>
59776 +#include <xen/xenbus.h>
59777 +#include <linux/cpu.h>
59778 +#include <linux/kthread.h>
59779 +#include <xen/gnttab.h>
59780 +#include <xen/xencons.h>
59781 +#include <xen/cpu_hotplug.h>
59782 +
59783 +#if defined(__i386__) || defined(__x86_64__)
59784 +
59785 +/*
59786 + * Power off function, if any
59787 + */
59788 +void (*pm_power_off)(void);
59789 +EXPORT_SYMBOL(pm_power_off);
59790 +
59791 +void machine_emergency_restart(void)
59792 +{
59793 +       /* We really want to get pending console data out before we die. */
59794 +       xencons_force_flush();
59795 +       HYPERVISOR_shutdown(SHUTDOWN_reboot);
59796 +}
59797 +
59798 +void machine_restart(char * __unused)
59799 +{
59800 +       machine_emergency_restart();
59801 +}
59802 +
59803 +void machine_halt(void)
59804 +{
59805 +       machine_power_off();
59806 +}
59807 +
59808 +void machine_power_off(void)
59809 +{
59810 +       /* We really want to get pending console data out before we die. */
59811 +       xencons_force_flush();
59812 +       if (pm_power_off)
59813 +               pm_power_off();
59814 +       HYPERVISOR_shutdown(SHUTDOWN_poweroff);
59815 +}
59816 +
59817 +int reboot_thru_bios = 0;      /* for dmi_scan.c */
59818 +EXPORT_SYMBOL(machine_restart);
59819 +EXPORT_SYMBOL(machine_halt);
59820 +EXPORT_SYMBOL(machine_power_off);
59821 +
59822 +/* Ensure we run on the idle task page tables so that we will
59823 +   switch page tables before running user space. This is needed
59824 +   on architectures with separate kernel and user page tables
59825 +   because the user page table pointer is not saved/restored. */
59826 +static void switch_idle_mm(void)
59827 +{
59828 +       struct mm_struct *mm = current->active_mm;
59829 +
59830 +       if (mm == &init_mm)
59831 +               return;
59832 +
59833 +       atomic_inc(&init_mm.mm_count);
59834 +       switch_mm(mm, &init_mm, current);
59835 +       current->active_mm = &init_mm;
59836 +       mmdrop(mm);
59837 +}
59838 +
59839 +static void pre_suspend(void)
59840 +{
59841 +       HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
59842 +       clear_fixmap(FIX_SHARED_INFO);
59843 +
59844 +       xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
59845 +       xen_start_info->console.domU.mfn =
59846 +               mfn_to_pfn(xen_start_info->console.domU.mfn);
59847 +}
59848 +
59849 +static void post_suspend(void)
59850 +{
59851 +       int i, j, k, fpp;
59852 +       extern unsigned long max_pfn;
59853 +       extern unsigned long *pfn_to_mfn_frame_list_list;
59854 +       extern unsigned long *pfn_to_mfn_frame_list[];
59855 +
59856 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
59857 +
59858 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
59859 +
59860 +       memset(empty_zero_page, 0, PAGE_SIZE);
59861 +
59862 +       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
59863 +               virt_to_mfn(pfn_to_mfn_frame_list_list);
59864 +
59865 +       fpp = PAGE_SIZE/sizeof(unsigned long);
59866 +       for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
59867 +               if ((j % fpp) == 0) {
59868 +                       k++;
59869 +                       pfn_to_mfn_frame_list_list[k] =
59870 +                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
59871 +                       j = 0;
59872 +               }
59873 +               pfn_to_mfn_frame_list[k][j] =
59874 +                       virt_to_mfn(&phys_to_machine_mapping[i]);
59875 +       }
59876 +       HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
59877 +}
59878 +
59879 +#else /* !(defined(__i386__) || defined(__x86_64__)) */
59880 +
59881 +#define switch_idle_mm()       ((void)0)
59882 +#define mm_pin_all()           ((void)0)
59883 +#define pre_suspend()          ((void)0)
59884 +#define post_suspend()         ((void)0)
59885 +
59886 +#endif
59887 +
59888 +int __xen_suspend(void)
59889 +{
59890 +       int err;
59891 +
59892 +       extern void time_resume(void);
59893 +
59894 +       BUG_ON(smp_processor_id() != 0);
59895 +       BUG_ON(in_interrupt());
59896 +
59897 +#if defined(__i386__) || defined(__x86_64__)
59898 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
59899 +               printk(KERN_WARNING "Cannot suspend in "
59900 +                      "auto_translated_physmap mode.\n");
59901 +               return -EOPNOTSUPP;
59902 +       }
59903 +#endif
59904 +
59905 +       err = smp_suspend();
59906 +       if (err)
59907 +               return err;
59908 +
59909 +       xenbus_suspend();
59910 +
59911 +       preempt_disable();
59912 +
59913 +       mm_pin_all();
59914 +       local_irq_disable();
59915 +       preempt_enable();
59916 +
59917 +       gnttab_suspend();
59918 +
59919 +       pre_suspend();
59920 +
59921 +       /*
59922 +        * We'll stop somewhere inside this hypercall. When it returns,
59923 +        * we'll start resuming after the restore.
59924 +        */
59925 +       HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
59926 +
59927 +       post_suspend();
59928 +
59929 +       gnttab_resume();
59930 +
59931 +       irq_resume();
59932 +
59933 +       time_resume();
59934 +
59935 +       switch_idle_mm();
59936 +
59937 +       local_irq_enable();
59938 +
59939 +       xencons_resume();
59940 +
59941 +       xenbus_resume();
59942 +
59943 +       smp_resume();
59944 +
59945 +       return err;
59946 +}
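For readability, the save/restore path above is symmetric around the suspend hypercall: state is torn down and brought back in mirror order (sketch of the i386/x86_64 case):

      smp_suspend()                            smp_resume()
      xenbus_suspend()                         xenbus_resume()
      gnttab_suspend()                         gnttab_resume()
      pre_suspend()     HYPERVISOR_suspend()   post_suspend()

with IRQ, time and console state (irq_resume(), time_resume(), xencons_resume()) rebuilt on the resume side before xenbus and SMP come back.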
59947 diff -ruNp linux-2.6.19/drivers/xen/core/reboot.c linux-2.6.19-xen-3.0.4/drivers/xen/core/reboot.c
59948 --- linux-2.6.19/drivers/xen/core/reboot.c      1970-01-01 00:00:00.000000000 +0000
59949 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/reboot.c    2007-02-02 19:10:45.000000000 +0000
59950 @@ -0,0 +1,221 @@
59951 +#define __KERNEL_SYSCALLS__
59952 +#include <linux/version.h>
59953 +#include <linux/kernel.h>
59954 +#include <linux/unistd.h>
59955 +#include <linux/module.h>
59956 +#include <linux/reboot.h>
59957 +#include <linux/syscalls.h>
59958 +#include <linux/sysrq.h>
59959 +#include <asm/hypervisor.h>
59960 +#include <xen/xenbus.h>
59961 +#include <linux/kthread.h>
59962 +
59963 +MODULE_LICENSE("Dual BSD/GPL");
59964 +
59965 +#define SHUTDOWN_INVALID  -1
59966 +#define SHUTDOWN_POWEROFF  0
59967 +#define SHUTDOWN_SUSPEND   2
59968 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
59969 + * report a crash, not be instructed to crash!
59970 + * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
59971 + * the distinction when we return the reason code to them.
59972 + */
59973 +#define SHUTDOWN_HALT      4
59974 +
59975 +/* Ignore multiple shutdown requests. */
59976 +static int shutting_down = SHUTDOWN_INVALID;
59977 +
59978 +static void __shutdown_handler(void *unused);
59979 +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
59980 +
59981 +#ifdef CONFIG_XEN
59982 +int __xen_suspend(void);
59983 +#else
59984 +#define __xen_suspend() (void)0
59985 +#endif
59986 +
59987 +static int shutdown_process(void *__unused)
59988 +{
59989 +       static char *envp[] = { "HOME=/", "TERM=linux",
59990 +                               "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
59991 +       static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
59992 +
59993 +       extern asmlinkage long sys_reboot(int magic1, int magic2,
59994 +                                         unsigned int cmd, void *arg);
59995 +
59996 +       if ((shutting_down == SHUTDOWN_POWEROFF) ||
59997 +           (shutting_down == SHUTDOWN_HALT)) {
59998 +               if (kernel_execve("/sbin/poweroff", poweroff_argv, envp) < 0) {
59999 +#ifdef CONFIG_XEN
60000 +                       sys_reboot(LINUX_REBOOT_MAGIC1,
60001 +                                  LINUX_REBOOT_MAGIC2,
60002 +                                  LINUX_REBOOT_CMD_POWER_OFF,
60003 +                                  NULL);
60004 +#endif /* CONFIG_XEN */
60005 +               }
60006 +       }
60007 +
60008 +       shutting_down = SHUTDOWN_INVALID; /* could try again */
60009 +
60010 +       return 0;
60011 +}
60012 +
60013 +static int xen_suspend(void *__unused)
60014 +{
60015 +       __xen_suspend();
60016 +       shutting_down = SHUTDOWN_INVALID;
60017 +       return 0;
60018 +}
60019 +
60020 +static int kthread_create_on_cpu(int (*f)(void *arg),
60021 +                                void *arg,
60022 +                                const char *name,
60023 +                                int cpu)
60024 +{
60025 +       struct task_struct *p;
60026 +       p = kthread_create(f, arg, name);
60027 +       if (IS_ERR(p))
60028 +               return PTR_ERR(p);
60029 +       kthread_bind(p, cpu);
60030 +       wake_up_process(p);
60031 +       return 0;
60032 +}
60033 +
60034 +static void __shutdown_handler(void *unused)
60035 +{
60036 +       int err;
60037 +
60038 +       if (shutting_down != SHUTDOWN_SUSPEND)
60039 +               err = kernel_thread(shutdown_process, NULL,
60040 +                                   CLONE_FS | CLONE_FILES);
60041 +       else
60042 +               err = kthread_create_on_cpu(xen_suspend, NULL, "suspend", 0);
60043 +
60044 +       if (err < 0) {
60045 +               printk(KERN_WARNING "Error creating shutdown process (%d): "
60046 +                      "retrying...\n", -err);
60047 +               schedule_delayed_work(&shutdown_work, HZ/2);
60048 +       }
60049 +}
60050 +
60051 +static void shutdown_handler(struct xenbus_watch *watch,
60052 +                            const char **vec, unsigned int len)
60053 +{
60054 +       char *str;
60055 +       struct xenbus_transaction xbt;
60056 +       int err;
60057 +
60058 +       if (shutting_down != SHUTDOWN_INVALID)
60059 +               return;
60060 +
60061 + again:
60062 +       err = xenbus_transaction_start(&xbt);
60063 +       if (err)
60064 +               return;
60065 +       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
60066 +       /* Ignore read errors and empty reads. */
60067 +       if (XENBUS_IS_ERR_READ(str)) {
60068 +               xenbus_transaction_end(xbt, 1);
60069 +               return;
60070 +       }
60071 +
60072 +       xenbus_write(xbt, "control", "shutdown", "");
60073 +
60074 +       err = xenbus_transaction_end(xbt, 0);
60075 +       if (err == -EAGAIN) {
60076 +               kfree(str);
60077 +               goto again;
60078 +       }
60079 +
60080 +       if (strcmp(str, "poweroff") == 0)
60081 +               shutting_down = SHUTDOWN_POWEROFF;
60082 +       else if (strcmp(str, "reboot") == 0)
60083 +               kill_proc(1, SIGINT, 1); /* interrupt init */
60084 +       else if (strcmp(str, "suspend") == 0)
60085 +               shutting_down = SHUTDOWN_SUSPEND;
60086 +       else if (strcmp(str, "halt") == 0)
60087 +               shutting_down = SHUTDOWN_HALT;
60088 +       else {
60089 +               printk("Ignoring shutdown request: %s\n", str);
60090 +               shutting_down = SHUTDOWN_INVALID;
60091 +       }
60092 +
60093 +       if (shutting_down != SHUTDOWN_INVALID)
60094 +               schedule_work(&shutdown_work);
60095 +
60096 +       kfree(str);
60097 +}
60098 +
60099 +static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
60100 +                         unsigned int len)
60101 +{
60102 +       char sysrq_key = '\0';
60103 +       struct xenbus_transaction xbt;
60104 +       int err;
60105 +
60106 + again:
60107 +       err = xenbus_transaction_start(&xbt);
60108 +       if (err)
60109 +               return;
60110 +       if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
60111 +               printk(KERN_ERR "Unable to read sysrq code in "
60112 +                      "control/sysrq\n");
60113 +               xenbus_transaction_end(xbt, 1);
60114 +               return;
60115 +       }
60116 +
60117 +       if (sysrq_key != '\0')
60118 +               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
60119 +
60120 +       err = xenbus_transaction_end(xbt, 0);
60121 +       if (err == -EAGAIN)
60122 +               goto again;
60123 +
60124 +#ifdef CONFIG_MAGIC_SYSRQ
60125 +       if (sysrq_key != '\0')
60126 +               handle_sysrq(sysrq_key, NULL);
60127 +#endif
60128 +}
60129 +
60130 +static struct xenbus_watch shutdown_watch = {
60131 +       .node = "control/shutdown",
60132 +       .callback = shutdown_handler
60133 +};
60134 +
60135 +static struct xenbus_watch sysrq_watch = {
60136 +       .node = "control/sysrq",
60137 +       .callback = sysrq_handler
60138 +};
60139 +
60140 +static int setup_shutdown_watcher(struct notifier_block *notifier,
60141 +                                 unsigned long event,
60142 +                                 void *data)
60143 +{
60144 +       int err;
60145 +
60146 +       err = register_xenbus_watch(&shutdown_watch);
60147 +       if (err)
60148 +               printk(KERN_ERR "Failed to set shutdown watcher\n");
60149 +       else
60150 +               xenbus_write(XBT_NIL, "control", "feature-reboot", "1");
60151 +
60152 +       err = register_xenbus_watch(&sysrq_watch);
60153 +       if (err)
60154 +               printk(KERN_ERR "Failed to set sysrq watcher\n");
60155 +       else
60156 +               xenbus_write(XBT_NIL, "control", "feature-sysrq", "1");
60157 +
60158 +       return NOTIFY_DONE;
60159 +}
60160 +
60161 +static int __init setup_shutdown_event(void)
60162 +{
60163 +       static struct notifier_block xenstore_notifier = {
60164 +               .notifier_call = setup_shutdown_watcher
60165 +       };
60166 +       register_xenstore_notifier(&xenstore_notifier);
60167 +
60168 +       return 0;
60169 +}
60170 +
60171 +subsys_initcall(setup_shutdown_event);
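The watches above are driven from dom0: the toolstack (xm shutdown, xm save, and so on) writes one of "poweroff", "reboot", "halt" or "suspend" into the domain's control/shutdown xenstore node, which can also be done by hand for testing (hypothetical domain ID):

      xenstore-write /local/domain/$DOMID/control/shutdown suspend

shutdown_handler() then consumes and clears the node; poweroff/halt/suspend are handed to shutdown_work (shutdown_process() or xen_suspend()), while a "reboot" request sends SIGINT to init directly.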
60172 diff -ruNp linux-2.6.19/drivers/xen/core/skbuff.c linux-2.6.19-xen-3.0.4/drivers/xen/core/skbuff.c
60173 --- linux-2.6.19/drivers/xen/core/skbuff.c      1970-01-01 00:00:00.000000000 +0000
60174 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/skbuff.c    2007-02-02 19:10:45.000000000 +0000
60175 @@ -0,0 +1,144 @@
60176 +
60177 +#include <linux/module.h>
60178 +#include <linux/version.h>
60179 +#include <linux/kernel.h>
60180 +#include <linux/sched.h>
60181 +#include <linux/slab.h>
60182 +#include <linux/netdevice.h>
60183 +#include <linux/inetdevice.h>
60184 +#include <linux/etherdevice.h>
60185 +#include <linux/skbuff.h>
60186 +#include <linux/init.h>
60187 +#include <asm/io.h>
60188 +#include <asm/page.h>
60189 +#include <asm/hypervisor.h>
60190 +
60191 +/* Referenced in netback.c. */
60192 +/*static*/ kmem_cache_t *skbuff_cachep;
60193 +EXPORT_SYMBOL(skbuff_cachep);
60194 +
60195 +/* Allow up to 64kB or page-sized packets (whichever is greater). */
60196 +#if PAGE_SHIFT < 16
60197 +#define MAX_SKBUFF_ORDER (16 - PAGE_SHIFT)
60198 +#else
60199 +#define MAX_SKBUFF_ORDER 0
60200 +#endif
60201 +static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
60202 +
60203 +static struct {
60204 +       int size;
60205 +       kmem_cache_t *cachep;
60206 +} skbuff_small[] = { { 512, NULL }, { 2048, NULL } };
60207 +
60208 +struct sk_buff *__alloc_skb(unsigned int length, gfp_t gfp_mask,
60209 +                           int fclone)
60210 +{
60211 +       int order, i;
60212 +       kmem_cache_t *cachep;
60213 +
60214 +       length = SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info);
60215 +
60216 +       if (length <= skbuff_small[ARRAY_SIZE(skbuff_small)-1].size) {
60217 +               for (i = 0; skbuff_small[i].size < length; i++)
60218 +                       continue;
60219 +               cachep = skbuff_small[i].cachep;
60220 +       } else {
60221 +               order = get_order(length);
60222 +               if (order > MAX_SKBUFF_ORDER) {
60223 +                       printk(KERN_ALERT "Attempt to allocate order %d "
60224 +                              "skbuff. Increase MAX_SKBUFF_ORDER.\n", order);
60225 +                       return NULL;
60226 +               }
60227 +               cachep = skbuff_order_cachep[order];
60228 +       }
60229 +
60230 +       length -= sizeof(struct skb_shared_info);
60231 +
60232 +       return alloc_skb_from_cache(cachep, length, gfp_mask, fclone);
60233 +}
60234 +
60235 +struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask)
60236 +{
60237 +       struct sk_buff *skb;
60238 +       int order;
60239 +
60240 +       length = SKB_DATA_ALIGN(length + 16);
60241 +       order = get_order(length + sizeof(struct skb_shared_info));
60242 +       if (order > MAX_SKBUFF_ORDER) {
60243 +               printk(KERN_ALERT "Attempt to allocate order %d skbuff. "
60244 +                      "Increase MAX_SKBUFF_ORDER.\n", order);
60245 +               return NULL;
60246 +       }
60247 +
60248 +       skb = alloc_skb_from_cache(
60249 +               skbuff_order_cachep[order], length, gfp_mask, 0);
60250 +       if (skb != NULL)
60251 +               skb_reserve(skb, 16);
60252 +
60253 +       return skb;
60254 +}
60255 +
60256 +static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
60257 +{
60258 +       int order = 0;
60259 +
60260 +       while (skbuff_order_cachep[order] != cachep)
60261 +               order++;
60262 +
60263 +       /* Do our best to allocate contiguous memory but fall back to IOMMU. */
60264 +       if (order != 0)
60265 +               (void)xen_create_contiguous_region(
60266 +                       (unsigned long)buf, order, 0);
60267 +
60268 +       scrub_pages(buf, 1 << order);
60269 +}
60270 +
60271 +static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused)
60272 +{
60273 +       int order = 0;
60274 +
60275 +       while (skbuff_order_cachep[order] != cachep)
60276 +               order++;
60277 +
60278 +       if (order != 0)
60279 +               xen_destroy_contiguous_region((unsigned long)buf, order);
60280 +}
60281 +
60282 +static int __init skbuff_init(void)
60283 +{
60284 +       static char name[MAX_SKBUFF_ORDER + 1][20];
60285 +       static char small_name[ARRAY_SIZE(skbuff_small)][20];
60286 +       unsigned long size;
60287 +       int i, order;
60288 +
60289 +       for (i = 0; i < ARRAY_SIZE(skbuff_small); i++) {
60290 +               size = skbuff_small[i].size;
60291 +               sprintf(small_name[i], "xen-skb-%lu", size);
60292 +               /*
60293 +                * No ctor/dtor: objects do not span page boundaries, and they
60294 +                * are only used on transmit path so no need for scrubbing.
60295 +                */
60296 +               skbuff_small[i].cachep = kmem_cache_create(
60297 +                       small_name[i], size, size, 0, NULL, NULL);
60298 +       }
60299 +
60300 +       for (order = 0; order <= MAX_SKBUFF_ORDER; order++) {
60301 +               size = PAGE_SIZE << order;
60302 +               sprintf(name[order], "xen-skb-%lu", size);
60303 +               if (is_running_on_xen() && is_initial_xendomain())
60304 +                       skbuff_order_cachep[order] = kmem_cache_create(
60305 +                               name[order], size, size, 0,
60306 +                               skbuff_ctor, skbuff_dtor);
60307 +               else
60308 +                       skbuff_order_cachep[order] = kmem_cache_create(
60309 +                               name[order], size, size, 0, NULL, NULL);
60310 +                       
60311 +       }
60312 +
60313 +       skbuff_cachep = skbuff_order_cachep[0];
60314 +
60315 +       return 0;
60316 +}
60317 +core_initcall(skbuff_init);
60318 +
60319 +EXPORT_SYMBOL(__dev_alloc_skb);
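As a worked example of the cache set created above on a 4 KiB-page build: PAGE_SHIFT is 12, so MAX_SKBUFF_ORDER = 16 - 12 = 4, giving the two small caches xen-skb-512 and xen-skb-2048 plus one cache per order, xen-skb-4096 through xen-skb-65536. __alloc_skb() then picks the smallest cache that fits SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info).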
60320 diff -ruNp linux-2.6.19/drivers/xen/core/smpboot.c linux-2.6.19-xen-3.0.4/drivers/xen/core/smpboot.c
60321 --- linux-2.6.19/drivers/xen/core/smpboot.c     1970-01-01 00:00:00.000000000 +0000
60322 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/smpboot.c   2007-02-02 19:10:45.000000000 +0000
60323 @@ -0,0 +1,461 @@
60324 +/*
60325 + *     Xen SMP booting functions
60326 + *
60327 + *     See arch/i386/kernel/smpboot.c for copyright and credits for derived
60328 + *     portions of this file.
60329 + */
60330 +
60331 +#include <linux/module.h>
60332 +#include <linux/init.h>
60333 +#include <linux/kernel.h>
60334 +#include <linux/mm.h>
60335 +#include <linux/sched.h>
60336 +#include <linux/kernel_stat.h>
60337 +#include <linux/smp_lock.h>
60338 +#include <linux/irq.h>
60339 +#include <linux/bootmem.h>
60340 +#include <linux/notifier.h>
60341 +#include <linux/cpu.h>
60342 +#include <linux/percpu.h>
60343 +#include <asm/desc.h>
60344 +#include <asm/arch_hooks.h>
60345 +#include <asm/pgalloc.h>
60346 +#include <xen/evtchn.h>
60347 +#include <xen/interface/vcpu.h>
60348 +#include <xen/cpu_hotplug.h>
60349 +#include <xen/xenbus.h>
60350 +
60351 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
60352 +extern irqreturn_t smp_call_function_interrupt(int, void *);
60353 +
60354 +extern int local_setup_timer(unsigned int cpu);
60355 +extern void local_teardown_timer(unsigned int cpu);
60356 +
60357 +extern void hypervisor_callback(void);
60358 +extern void failsafe_callback(void);
60359 +extern void system_call(void);
60360 +extern void smp_trap_init(trap_info_t *);
60361 +
60362 +/* Number of siblings per CPU package */
60363 +int smp_num_siblings = 1;
60364 +EXPORT_SYMBOL(smp_num_siblings);
60365 +#if defined(__i386__)
60366 +int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
60367 +#elif defined(__x86_64__)
60368 +u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
60369 +#endif
60370 +EXPORT_SYMBOL(cpu_llc_id);
60371 +
60372 +cpumask_t cpu_online_map;
60373 +EXPORT_SYMBOL(cpu_online_map);
60374 +cpumask_t cpu_possible_map;
60375 +EXPORT_SYMBOL(cpu_possible_map);
60376 +
60377 +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
60378 +EXPORT_SYMBOL(cpu_data);
60379 +
60380 +#ifdef CONFIG_HOTPLUG_CPU
60381 +DEFINE_PER_CPU(int, cpu_state) = { 0 };
60382 +#endif
60383 +
60384 +static DEFINE_PER_CPU(int, resched_irq);
60385 +static DEFINE_PER_CPU(int, callfunc_irq);
60386 +static char resched_name[NR_CPUS][15];
60387 +static char callfunc_name[NR_CPUS][15];
60388 +
60389 +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
60390 +
60391 +void *xquad_portio;
60392 +
60393 +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
60394 +EXPORT_SYMBOL(cpu_sibling_map);
60395 +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
60396 +EXPORT_SYMBOL(cpu_core_map);
60397 +
60398 +#if defined(__i386__)
60399 +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
60400 +EXPORT_SYMBOL(x86_cpu_to_apicid);
60401 +#endif
60402 +
60403 +void __init prefill_possible_map(void)
60404 +{
60405 +       int i, rc;
60406 +
60407 +       for (i = 0; i < NR_CPUS; i++) {
60408 +               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
60409 +               if (rc >= 0)
60410 +                       cpu_set(i, cpu_possible_map);
60411 +       }
60412 +}
60413 +
60414 +void __init smp_alloc_memory(void)
60415 +{
60416 +}
60417 +
60418 +static inline void
60419 +set_cpu_sibling_map(int cpu)
60420 +{
60421 +       cpu_data[cpu].phys_proc_id = cpu;
60422 +       cpu_data[cpu].cpu_core_id = 0;
60423 +
60424 +       cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
60425 +       cpu_core_map[cpu]    = cpumask_of_cpu(cpu);
60426 +
60427 +       cpu_data[cpu].booted_cores = 1;
60428 +}
60429 +
60430 +static void
60431 +remove_siblinginfo(int cpu)
60432 +{
60433 +       cpu_data[cpu].phys_proc_id = BAD_APICID;
60434 +       cpu_data[cpu].cpu_core_id  = BAD_APICID;
60435 +
60436 +       cpus_clear(cpu_sibling_map[cpu]);
60437 +       cpus_clear(cpu_core_map[cpu]);
60438 +
60439 +       cpu_data[cpu].booted_cores = 0;
60440 +}
60441 +
60442 +static int xen_smp_intr_init(unsigned int cpu)
60443 +{
60444 +       int rc;
60445 +
60446 +       per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
60447 +
60448 +       sprintf(resched_name[cpu], "resched%d", cpu);
60449 +       rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
60450 +                                   cpu,
60451 +                                   smp_reschedule_interrupt,
60452 +                                   SA_INTERRUPT,
60453 +                                   resched_name[cpu],
60454 +                                   NULL);
60455 +       if (rc < 0)
60456 +               goto fail;
60457 +       per_cpu(resched_irq, cpu) = rc;
60458 +
60459 +       sprintf(callfunc_name[cpu], "callfunc%d", cpu);
60460 +       rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
60461 +                                   cpu,
60462 +                                   smp_call_function_interrupt,
60463 +                                   SA_INTERRUPT,
60464 +                                   callfunc_name[cpu],
60465 +                                   NULL);
60466 +       if (rc < 0)
60467 +               goto fail;
60468 +       per_cpu(callfunc_irq, cpu) = rc;
60469 +
60470 +       if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
60471 +               goto fail;
60472 +
60473 +       return 0;
60474 +
60475 + fail:
60476 +       if (per_cpu(resched_irq, cpu) >= 0)
60477 +               unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
60478 +       if (per_cpu(callfunc_irq, cpu) >= 0)
60479 +               unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
60480 +       return rc;
60481 +}
60482 +
60483 +#ifdef CONFIG_HOTPLUG_CPU
60484 +static void xen_smp_intr_exit(unsigned int cpu)
60485 +{
60486 +       if (cpu != 0)
60487 +               local_teardown_timer(cpu);
60488 +
60489 +       unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
60490 +       unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
60491 +}
60492 +#endif
60493 +
60494 +void cpu_bringup(void)
60495 +{
60496 +       cpu_init();
60497 +       touch_softlockup_watchdog();
60498 +       preempt_disable();
60499 +       local_irq_enable();
60500 +}
60501 +
60502 +static void cpu_bringup_and_idle(void)
60503 +{
60504 +       cpu_bringup();
60505 +       cpu_idle();
60506 +}
60507 +
60508 +void cpu_initialize_context(unsigned int cpu)
60509 +{
60510 +       vcpu_guest_context_t ctxt;
60511 +       struct task_struct *idle = idle_task(cpu);
60512 +#ifdef __x86_64__
60513 +       struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
60514 +#else
60515 +       struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
60516 +#endif
60517 +
60518 +       if (cpu == 0)
60519 +               return;
60520 +
60521 +       memset(&ctxt, 0, sizeof(ctxt));
60522 +
60523 +       ctxt.flags = VGCF_IN_KERNEL;
60524 +       ctxt.user_regs.ds = __USER_DS;
60525 +       ctxt.user_regs.es = __USER_DS;
60526 +       ctxt.user_regs.fs = 0;
60527 +       ctxt.user_regs.gs = 0;
60528 +       ctxt.user_regs.ss = __KERNEL_DS;
60529 +       ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
60530 +       ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
60531 +
60532 +       memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
60533 +
60534 +       smp_trap_init(ctxt.trap_ctxt);
60535 +
60536 +       ctxt.ldt_ents = 0;
60537 +
60538 +       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
60539 +       ctxt.gdt_ents      = gdt_descr->size / 8;
60540 +
60541 +#ifdef __i386__
60542 +       ctxt.user_regs.cs = __KERNEL_CS;
60543 +       ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
60544 +
60545 +       ctxt.kernel_ss = __KERNEL_DS;
60546 +       ctxt.kernel_sp = idle->thread.esp0;
60547 +
60548 +       ctxt.event_callback_cs     = __KERNEL_CS;
60549 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
60550 +       ctxt.failsafe_callback_cs  = __KERNEL_CS;
60551 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
60552 +
60553 +       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
60554 +#else /* __x86_64__ */
60555 +       ctxt.user_regs.cs = __KERNEL_CS;
60556 +       ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
60557 +
60558 +       ctxt.kernel_ss = __KERNEL_DS;
60559 +       ctxt.kernel_sp = idle->thread.rsp0;
60560 +
60561 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
60562 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
60563 +       ctxt.syscall_callback_eip  = (unsigned long)system_call;
60564 +
60565 +       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
60566 +
60567 +       ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
60568 +#endif
60569 +
60570 +       BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
60571 +}
60572 +
60573 +void __init smp_prepare_cpus(unsigned int max_cpus)
60574 +{
60575 +       int cpu;
60576 +       struct task_struct *idle;
60577 +#ifdef __x86_64__
60578 +       struct desc_ptr *gdt_descr;
60579 +#else
60580 +       struct Xgt_desc_struct *gdt_descr;
60581 +#endif
60582 +
60583 +       boot_cpu_data.apicid = 0;
60584 +       cpu_data[0] = boot_cpu_data;
60585 +
60586 +       cpu_2_logical_apicid[0] = 0;
60587 +       x86_cpu_to_apicid[0] = 0;
60588 +
60589 +       current_thread_info()->cpu = 0;
60590 +
60591 +       for (cpu = 0; cpu < NR_CPUS; cpu++) {
60592 +               cpus_clear(cpu_sibling_map[cpu]);
60593 +               cpus_clear(cpu_core_map[cpu]);
60594 +       }
60595 +
60596 +       set_cpu_sibling_map(0);
60597 +
60598 +       if (xen_smp_intr_init(0))
60599 +               BUG();
60600 +
60601 +       /* Restrict the possible_map according to max_cpus. */
60602 +       while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
60603 +               for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
60604 +                       continue;
60605 +               cpu_clear(cpu, cpu_possible_map);
60606 +       }
60607 +
60608 +       for_each_possible_cpu (cpu) {
60609 +               if (cpu == 0)
60610 +                       continue;
60611 +
60612 +#ifdef __x86_64__
60613 +               gdt_descr = &cpu_gdt_descr[cpu];
60614 +#else
60615 +               gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
60616 +#endif
60617 +               gdt_descr->address = get_zeroed_page(GFP_KERNEL);
60618 +               if (unlikely(!gdt_descr->address)) {
60619 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
60620 +                              cpu);
60621 +                       continue;
60622 +               }
60623 +               gdt_descr->size = GDT_SIZE;
60624 +               memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
60625 +               make_page_readonly(
60626 +                       (void *)gdt_descr->address,
60627 +                       XENFEAT_writable_descriptor_tables);
60628 +
60629 +               cpu_data[cpu] = boot_cpu_data;
60630 +               cpu_data[cpu].apicid = cpu;
60631 +
60632 +               cpu_2_logical_apicid[cpu] = cpu;
60633 +               x86_cpu_to_apicid[cpu] = cpu;
60634 +
60635 +               idle = fork_idle(cpu);
60636 +               if (IS_ERR(idle))
60637 +                       panic("failed fork for CPU %d", cpu);
60638 +
60639 +#ifdef __x86_64__
60640 +               cpu_pda(cpu)->pcurrent = idle;
60641 +               cpu_pda(cpu)->cpunumber = cpu;
60642 +               clear_ti_thread_flag(idle->thread_info, TIF_FORK);
60643 +#endif
60644 +
60645 +               irq_ctx_init(cpu);
60646 +
60647 +#ifdef CONFIG_HOTPLUG_CPU
60648 +               if (is_initial_xendomain())
60649 +                       cpu_set(cpu, cpu_present_map);
60650 +#else
60651 +               cpu_set(cpu, cpu_present_map);
60652 +#endif
60653 +
60654 +               cpu_initialize_context(cpu);
60655 +       }
60656 +
60657 +       init_xenbus_allowed_cpumask();
60658 +
60659 +       /*
60660 +        * Here we can be sure that there is an IO-APIC in the system. Let's
60661 +        * go and set it up:
60662 +        */
60663 +#ifdef CONFIG_X86_IO_APIC
60664 +       if (!skip_ioapic_setup && nr_ioapics)
60665 +               setup_IO_APIC();
60666 +#endif
60667 +}
60668 +
60669 +void __devinit smp_prepare_boot_cpu(void)
60670 +{
60671 +}
60672 +
60673 +#ifdef CONFIG_HOTPLUG_CPU
60674 +
60675 +/*
60676 + * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
60677 + * But do it early enough to catch critical for_each_present_cpu() loops
60678 + * in i386-specific code.
60679 + */
60680 +static int __init initialize_cpu_present_map(void)
60681 +{
60682 +       cpu_present_map = cpu_possible_map;
60683 +       return 0;
60684 +}
60685 +core_initcall(initialize_cpu_present_map);
60686 +
60687 +int __cpu_disable(void)
60688 +{
60689 +       cpumask_t map = cpu_online_map;
60690 +       int cpu = smp_processor_id();
60691 +
60692 +       if (cpu == 0)
60693 +               return -EBUSY;
60694 +
60695 +       remove_siblinginfo(cpu);
60696 +
60697 +       cpu_clear(cpu, map);
60698 +       fixup_irqs(map);
60699 +       cpu_clear(cpu, cpu_online_map);
60700 +
60701 +       return 0;
60702 +}
60703 +
60704 +void __cpu_die(unsigned int cpu)
60705 +{
60706 +       while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
60707 +               current->state = TASK_UNINTERRUPTIBLE;
60708 +               schedule_timeout(HZ/10);
60709 +       }
60710 +
60711 +       xen_smp_intr_exit(cpu);
60712 +
60713 +       if (num_online_cpus() == 1)
60714 +               alternatives_smp_switch(0);
60715 +}
60716 +
60717 +#else /* !CONFIG_HOTPLUG_CPU */
60718 +
60719 +int __cpu_disable(void)
60720 +{
60721 +       return -ENOSYS;
60722 +}
60723 +
60724 +void __cpu_die(unsigned int cpu)
60725 +{
60726 +       BUG();
60727 +}
60728 +
60729 +#endif /* CONFIG_HOTPLUG_CPU */
60730 +
60731 +int __devinit __cpu_up(unsigned int cpu)
60732 +{
60733 +       int rc;
60734 +
60735 +       rc = cpu_up_check(cpu);
60736 +       if (rc)
60737 +               return rc;
60738 +
60739 +       if (num_online_cpus() == 1)
60740 +               alternatives_smp_switch(1);
60741 +
60742 +       /* This must be done before setting cpu_online_map */
60743 +       set_cpu_sibling_map(cpu);
60744 +       wmb();
60745 +
60746 +       rc = xen_smp_intr_init(cpu);
60747 +       if (rc) {
60748 +               remove_siblinginfo(cpu);
60749 +               return rc;
60750 +       }
60751 +
60752 +       cpu_set(cpu, cpu_online_map);
60753 +
60754 +       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
60755 +       BUG_ON(rc);
60756 +
60757 +       return 0;
60758 +}
60759 +
60760 +void __init smp_cpus_done(unsigned int max_cpus)
60761 +{
60762 +}
60763 +
60764 +#ifdef CONFIG_X86_MPPARSE
60765 +/*
60766 + * If the BIOS enumerates physical processors before logical,
60767 + * maxcpus=N at enumeration-time can be used to disable HT.
60768 + */
60769 +static int __init parse_maxcpus(char *arg)
60770 +{
60771 +       extern unsigned int maxcpus;
60772 +
60773 +       maxcpus = simple_strtoul(arg, NULL, 0);
60774 +       return 0;
60775 +}
60776 +early_param("maxcpus", parse_maxcpus);
60777 +#endif
60778 +
60779 +#if defined(CONFIG_XEN_UNPRIVILEGED_GUEST) && defined(CONFIG_X86_32)
60780 +int setup_profiling_timer(unsigned int multiplier)
60781 +{
60782 +       return -EINVAL;
60783 +}
60784 +#endif
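
The hunk above replaces native AP startup with hypercalls: cpu_initialize_context() fills in a vcpu_guest_context for each secondary CPU, __cpu_up() brings it online with VCPUOP_up, and __cpu_die() polls VCPUOP_is_up until the hypervisor reports it down. A distilled, illustrative sketch of that sequence follows (not part of the patch; the helper names are hypothetical and error handling is reduced to BUG_ON()):

        /* Illustrative only: minimal bring-up of an already-prepared vCPU. */
        static void bring_up_vcpu(unsigned int cpu, vcpu_guest_context_t *ctxt)
        {
                /* Hand the vCPU its initial registers, GDT and pagetables. */
                BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt));
                /* Mark it runnable; it starts at ctxt->user_regs.eip (rip on x86_64). */
                BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
        }

        /* Illustrative only: wait for a dying vCPU, as __cpu_die() does above. */
        static void wait_for_vcpu_down(unsigned int cpu)
        {
                while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL))
                        schedule_timeout_uninterruptible(HZ / 10);
        }
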
60785 diff -ruNp linux-2.6.19/drivers/xen/core/xen_proc.c linux-2.6.19-xen-3.0.4/drivers/xen/core/xen_proc.c
60786 --- linux-2.6.19/drivers/xen/core/xen_proc.c    1970-01-01 00:00:00.000000000 +0000
60787 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/xen_proc.c  2007-02-02 19:10:45.000000000 +0000
60788 @@ -0,0 +1,18 @@
60789 +
60790 +#include <linux/proc_fs.h>
60791 +#include <xen/xen_proc.h>
60792 +
60793 +static struct proc_dir_entry *xen_base;
60794 +
60795 +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
60796 +{
60797 +       if ( xen_base == NULL )
60798 +               if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
60799 +                       panic("Couldn't create /proc/xen");
60800 +       return create_proc_entry(name, mode, xen_base);
60801 +}
60802 +
60803 +void remove_xen_proc_entry(const char *name)
60804 +{
60805 +       remove_proc_entry(name, xen_base);
60806 +}
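
create_xen_proc_entry() creates /proc/xen on first use and then behaves like create_proc_entry() inside that directory. A minimal, hypothetical usage sketch (not part of the patch; "foo" and foo_read are illustrative names, and the 2.6-era read_proc callback is assumed):

        static int foo_read(char *page, char **start, off_t off,
                            int count, int *eof, void *data)
        {
                *eof = 1;
                return sprintf(page, "hello from /proc/xen/foo\n");
        }

        static int __init foo_init(void)
        {
                struct proc_dir_entry *e = create_xen_proc_entry("foo", 0444);
                if (e == NULL)
                        return -ENOMEM;
                e->read_proc = foo_read;
                return 0;
        }
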
60807 diff -ruNp linux-2.6.19/drivers/xen/core/xen_sysfs.c linux-2.6.19-xen-3.0.4/drivers/xen/core/xen_sysfs.c
60808 --- linux-2.6.19/drivers/xen/core/xen_sysfs.c   1970-01-01 00:00:00.000000000 +0000
60809 +++ linux-2.6.19-xen-3.0.4/drivers/xen/core/xen_sysfs.c 2007-02-02 19:10:45.000000000 +0000
60810 @@ -0,0 +1,378 @@
60811 +/*
60812 + *  copyright (c) 2006 IBM Corporation
60813 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
60814 + *
60815 + *  This program is free software; you can redistribute it and/or modify
60816 + *  it under the terms of the GNU General Public License version 2 as
60817 + *  published by the Free Software Foundation.
60818 + */
60819 +
60820 +#include <linux/err.h>
60821 +#include <linux/kernel.h>
60822 +#include <linux/module.h>
60823 +#include <linux/init.h>
60824 +#include <asm/hypervisor.h>
60825 +#include <xen/features.h>
60826 +#include <xen/hypervisor_sysfs.h>
60827 +#include <xen/xenbus.h>
60828 +
60829 +MODULE_LICENSE("GPL");
60830 +MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
60831 +
60832 +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
60833 +{
60834 +       return sprintf(buffer, "xen\n");
60835 +}
60836 +
60837 +HYPERVISOR_ATTR_RO(type);
60838 +
60839 +static int __init xen_sysfs_type_init(void)
60840 +{
60841 +       return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
60842 +}
60843 +
60844 +static void xen_sysfs_type_destroy(void)
60845 +{
60846 +       sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
60847 +}
60848 +
60849 +/* xen version attributes */
60850 +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
60851 +{
60852 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
60853 +       if (version)
60854 +               return sprintf(buffer, "%d\n", version >> 16);
60855 +       return -ENODEV;
60856 +}
60857 +
60858 +HYPERVISOR_ATTR_RO(major);
60859 +
60860 +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
60861 +{
60862 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
60863 +       if (version)
60864 +               return sprintf(buffer, "%d\n", version & 0xff);
60865 +       return -ENODEV;
60866 +}
60867 +
60868 +HYPERVISOR_ATTR_RO(minor);
60869 +
60870 +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
60871 +{
60872 +       int ret = -ENOMEM;
60873 +       char *extra;
60874 +
60875 +       extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
60876 +       if (extra) {
60877 +               ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
60878 +               if (!ret)
60879 +                       ret = sprintf(buffer, "%s\n", extra);
60880 +               kfree(extra);
60881 +       }
60882 +
60883 +       return ret;
60884 +}
60885 +
60886 +HYPERVISOR_ATTR_RO(extra);
60887 +
60888 +static struct attribute *version_attrs[] = {
60889 +       &major_attr.attr,
60890 +       &minor_attr.attr,
60891 +       &extra_attr.attr,
60892 +       NULL
60893 +};
60894 +
60895 +static struct attribute_group version_group = {
60896 +       .name = "version",
60897 +       .attrs = version_attrs,
60898 +};
60899 +
60900 +static int __init xen_sysfs_version_init(void)
60901 +{
60902 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
60903 +                                 &version_group);
60904 +}
60905 +
60906 +static void xen_sysfs_version_destroy(void)
60907 +{
60908 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
60909 +}
60910 +
60911 +/* UUID */
60912 +
60913 +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
60914 +{
60915 +       char *vm, *val;
60916 +       int ret;
60917 +
60918 +       vm = xenbus_read(XBT_NIL, "vm", "", NULL);
60919 +       if (IS_ERR(vm))
60920 +               return PTR_ERR(vm);
60921 +       val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
60922 +       kfree(vm);
60923 +       if (IS_ERR(val))
60924 +               return PTR_ERR(val);
60925 +       ret = sprintf(buffer, "%s\n", val);
60926 +       kfree(val);
60927 +       return ret;
60928 +}
60929 +
60930 +HYPERVISOR_ATTR_RO(uuid);
60931 +
60932 +static int __init xen_sysfs_uuid_init(void)
60933 +{
60934 +       return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
60935 +}
60936 +
60937 +static void xen_sysfs_uuid_destroy(void)
60938 +{
60939 +       sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
60940 +}
60941 +
60942 +/* xen compilation attributes */
60943 +
60944 +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
60945 +{
60946 +       int ret = -ENOMEM;
60947 +       struct xen_compile_info *info;
60948 +
60949 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
60950 +       if (info) {
60951 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
60952 +               if (!ret)
60953 +                       ret = sprintf(buffer, "%s\n", info->compiler);
60954 +               kfree(info);
60955 +       }
60956 +
60957 +       return ret;
60958 +}
60959 +
60960 +HYPERVISOR_ATTR_RO(compiler);
60961 +
60962 +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
60963 +{
60964 +       int ret = -ENOMEM;
60965 +       struct xen_compile_info *info;
60966 +
60967 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
60968 +       if (info) {
60969 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
60970 +               if (!ret)
60971 +                       ret = sprintf(buffer, "%s\n", info->compile_by);
60972 +               kfree(info);
60973 +       }
60974 +
60975 +       return ret;
60976 +}
60977 +
60978 +HYPERVISOR_ATTR_RO(compiled_by);
60979 +
60980 +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
60981 +{
60982 +       int ret = -ENOMEM;
60983 +       struct xen_compile_info *info;
60984 +
60985 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
60986 +       if (info) {
60987 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
60988 +               if (!ret)
60989 +                       ret = sprintf(buffer, "%s\n", info->compile_date);
60990 +               kfree(info);
60991 +       }
60992 +
60993 +       return ret;
60994 +}
60995 +
60996 +HYPERVISOR_ATTR_RO(compile_date);
60997 +
60998 +static struct attribute *xen_compile_attrs[] = {
60999 +       &compiler_attr.attr,
61000 +       &compiled_by_attr.attr,
61001 +       &compile_date_attr.attr,
61002 +       NULL
61003 +};
61004 +
61005 +static struct attribute_group xen_compilation_group = {
61006 +       .name = "compilation",
61007 +       .attrs = xen_compile_attrs,
61008 +};
61009 +
61010 +static int __init xen_compilation_init(void)
61011 +{
61012 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
61013 +                                 &xen_compilation_group);
61014 +}
61015 +
61016 +static void xen_compilation_destroy(void)
61017 +{
61018 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj,
61019 +                          &xen_compilation_group);
61020 +}
61021 +
61022 +/* xen properties info */
61023 +
61024 +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
61025 +{
61026 +       int ret = -ENOMEM;
61027 +       char *caps;
61028 +
61029 +       caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
61030 +       if (caps) {
61031 +               ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
61032 +               if (!ret)
61033 +                       ret = sprintf(buffer, "%s\n", caps);
61034 +               kfree(caps);
61035 +       }
61036 +
61037 +       return ret;
61038 +}
61039 +
61040 +HYPERVISOR_ATTR_RO(capabilities);
61041 +
61042 +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
61043 +{
61044 +       int ret = -ENOMEM;
61045 +       char *cset;
61046 +
61047 +       cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
61048 +       if (cset) {
61049 +               ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
61050 +               if (!ret)
61051 +                       ret = sprintf(buffer, "%s\n", cset);
61052 +               kfree(cset);
61053 +       }
61054 +
61055 +       return ret;
61056 +}
61057 +
61058 +HYPERVISOR_ATTR_RO(changeset);
61059 +
61060 +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
61061 +{
61062 +       int ret = -ENOMEM;
61063 +       struct xen_platform_parameters *parms;
61064 +
61065 +       parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
61066 +       if (parms) {
61067 +               ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
61068 +                                            parms);
61069 +               if (!ret)
61070 +                       ret = sprintf(buffer, "%lx\n", parms->virt_start);
61071 +               kfree(parms);
61072 +       }
61073 +
61074 +       return ret;
61075 +}
61076 +
61077 +HYPERVISOR_ATTR_RO(virtual_start);
61078 +
61079 +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
61080 +{
61081 +       int ret;
61082 +
61083 +       ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
61084 +       if (ret > 0)
61085 +               ret = sprintf(buffer, "%x\n", ret);
61086 +
61087 +       return ret;
61088 +}
61089 +
61090 +HYPERVISOR_ATTR_RO(pagesize);
61091 +
61092 +/* eventually there will be several more features to export */
61093 +static ssize_t xen_feature_show(int index, char *buffer)
61094 +{
61095 +       int ret = -ENOMEM;
61096 +       struct xen_feature_info *info;
61097 +
61098 +       info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
61099 +       if (info) {
61100 +               info->submap_idx = index;
61101 +               ret = HYPERVISOR_xen_version(XENVER_get_features, info);
61102 +               if (!ret)
61103 +                       ret = sprintf(buffer, "%d\n", info->submap);
61104 +               kfree(info);
61105 +       }
61106 +
61107 +       return ret;
61108 +}
61109 +
61110 +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
61111 +{
61112 +       return xen_feature_show(XENFEAT_writable_page_tables, buffer);
61113 +}
61114 +
61115 +HYPERVISOR_ATTR_RO(writable_pt);
61116 +
61117 +static struct attribute *xen_properties_attrs[] = {
61118 +       &capabilities_attr.attr,
61119 +       &changeset_attr.attr,
61120 +       &virtual_start_attr.attr,
61121 +       &pagesize_attr.attr,
61122 +       &writable_pt_attr.attr,
61123 +       NULL
61124 +};
61125 +
61126 +static struct attribute_group xen_properties_group = {
61127 +       .name = "properties",
61128 +       .attrs = xen_properties_attrs,
61129 +};
61130 +
61131 +static int __init xen_properties_init(void)
61132 +{
61133 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
61134 +                                 &xen_properties_group);
61135 +}
61136 +
61137 +static void xen_properties_destroy(void)
61138 +{
61139 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj,
61140 +                          &xen_properties_group);
61141 +}
61142 +
61143 +static int __init hyper_sysfs_init(void)
61144 +{
61145 +       int ret;
61146 +
61147 +       if (!is_running_on_xen())
61148 +               return -ENODEV;
61149 +
61150 +       ret = xen_sysfs_type_init();
61151 +       if (ret)
61152 +               goto out;
61153 +       ret = xen_sysfs_version_init();
61154 +       if (ret)
61155 +               goto version_out;
61156 +       ret = xen_compilation_init();
61157 +       if (ret)
61158 +               goto comp_out;
61159 +       ret = xen_sysfs_uuid_init();
61160 +       if (ret)
61161 +               goto uuid_out;
61162 +       ret = xen_properties_init();
61163 +       if (!ret)
61164 +               goto out;
61165 +
61166 +       xen_sysfs_uuid_destroy();
61167 +uuid_out:
61168 +       xen_compilation_destroy();
61169 +comp_out:
61170 +       xen_sysfs_version_destroy();
61171 +version_out:
61172 +       xen_sysfs_type_destroy();
61173 +out:
61174 +       return ret;
61175 +}
61176 +
61177 +static void hyper_sysfs_exit(void)
61178 +{
61179 +       xen_properties_destroy();
61180 +       xen_compilation_destroy();
61181 +       xen_sysfs_uuid_destroy();
61182 +       xen_sysfs_version_destroy();
61183 +       xen_sysfs_type_destroy();
61184 +
61185 +}
61186 +
61187 +module_init(hyper_sysfs_init);
61188 +module_exit(hyper_sysfs_exit);
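
xen_sysfs.c exposes hypervisor details under /sys/hypervisor/: flat "type" and "uuid" attributes plus the "version", "compilation" and "properties" groups. Each read-only attribute follows the same pattern: a *_show() helper, HYPERVISOR_ATTR_RO(), and an entry in the group's attribute array. A hypothetical sketch of adding one more feature flag to the properties group (the attribute name is illustrative; XENFEAT_auto_translated_physmap is taken from xen/interface/features.h):

        static ssize_t auto_translated_show(struct hyp_sysfs_attr *attr, char *buffer)
        {
                return xen_feature_show(XENFEAT_auto_translated_physmap, buffer);
        }

        HYPERVISOR_ATTR_RO(auto_translated);

        /* ...then add &auto_translated_attr.attr to xen_properties_attrs[]
         * before the terminating NULL so sysfs_create_group() exports it. */
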
61189 diff -ruNp linux-2.6.19/drivers/xen/evtchn/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/evtchn/Makefile
61190 --- linux-2.6.19/drivers/xen/evtchn/Makefile    1970-01-01 00:00:00.000000000 +0000
61191 +++ linux-2.6.19-xen-3.0.4/drivers/xen/evtchn/Makefile  2007-02-02 19:10:45.000000000 +0000
61192 @@ -0,0 +1,2 @@
61193 +
61194 +obj-y  := evtchn.o
61195 diff -ruNp linux-2.6.19/drivers/xen/evtchn/evtchn.c linux-2.6.19-xen-3.0.4/drivers/xen/evtchn/evtchn.c
61196 --- linux-2.6.19/drivers/xen/evtchn/evtchn.c    1970-01-01 00:00:00.000000000 +0000
61197 +++ linux-2.6.19-xen-3.0.4/drivers/xen/evtchn/evtchn.c  2007-02-02 19:10:45.000000000 +0000
61198 @@ -0,0 +1,456 @@
61199 +/******************************************************************************
61200 + * evtchn.c
61201 + * 
61202 + * Driver for receiving and demuxing event-channel signals.
61203 + * 
61204 + * Copyright (c) 2004-2005, K A Fraser
61205 + * Multi-process extensions Copyright (c) 2004, Steven Smith
61206 + * 
61207 + * This program is free software; you can redistribute it and/or
61208 + * modify it under the terms of the GNU General Public License version 2
61209 + * as published by the Free Software Foundation; or, when distributed
61210 + * separately from the Linux kernel or incorporated into other
61211 + * software packages, subject to the following license:
61212 + * 
61213 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61214 + * of this source file (the "Software"), to deal in the Software without
61215 + * restriction, including without limitation the rights to use, copy, modify,
61216 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61217 + * and to permit persons to whom the Software is furnished to do so, subject to
61218 + * the following conditions:
61219 + * 
61220 + * The above copyright notice and this permission notice shall be included in
61221 + * all copies or substantial portions of the Software.
61222 + * 
61223 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61224 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61225 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61226 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61227 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61228 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61229 + * IN THE SOFTWARE.
61230 + */
61231 +
61232 +#include <linux/module.h>
61233 +#include <linux/kernel.h>
61234 +#include <linux/sched.h>
61235 +#include <linux/slab.h>
61236 +#include <linux/string.h>
61237 +#include <linux/errno.h>
61238 +#include <linux/fs.h>
61239 +#include <linux/errno.h>
61240 +#include <linux/miscdevice.h>
61241 +#include <linux/major.h>
61242 +#include <linux/proc_fs.h>
61243 +#include <linux/stat.h>
61244 +#include <linux/poll.h>
61245 +#include <linux/irq.h>
61246 +#include <linux/init.h>
61247 +#include <linux/gfp.h>
61248 +#include <xen/evtchn.h>
61249 +#include <xen/public/evtchn.h>
61250 +
61251 +struct per_user_data {
61252 +       /* Notification ring, accessed via /dev/xen/evtchn. */
61253 +#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
61254 +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
61255 +       evtchn_port_t *ring;
61256 +       unsigned int ring_cons, ring_prod, ring_overflow;
61257 +
61258 +       /* Processes wait on this queue when ring is empty. */
61259 +       wait_queue_head_t evtchn_wait;
61260 +       struct fasync_struct *evtchn_async_queue;
61261 +};
61262 +
61263 +/* Who's bound to each port? */
61264 +static struct per_user_data *port_user[NR_EVENT_CHANNELS];
61265 +static spinlock_t port_user_lock;
61266 +
61267 +void evtchn_device_upcall(int port)
61268 +{
61269 +       struct per_user_data *u;
61270 +
61271 +       spin_lock(&port_user_lock);
61272 +
61273 +       mask_evtchn(port);
61274 +       clear_evtchn(port);
61275 +
61276 +       if ((u = port_user[port]) != NULL) {
61277 +               if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
61278 +                       u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
61279 +                       if (u->ring_cons == u->ring_prod++) {
61280 +                               wake_up_interruptible(&u->evtchn_wait);
61281 +                               kill_fasync(&u->evtchn_async_queue,
61282 +                                           SIGIO, POLL_IN);
61283 +                       }
61284 +               } else {
61285 +                       u->ring_overflow = 1;
61286 +               }
61287 +       }
61288 +
61289 +       spin_unlock(&port_user_lock);
61290 +}
61291 +
61292 +static ssize_t evtchn_read(struct file *file, char __user *buf,
61293 +                          size_t count, loff_t *ppos)
61294 +{
61295 +       int rc;
61296 +       unsigned int c, p, bytes1 = 0, bytes2 = 0;
61297 +       struct per_user_data *u = file->private_data;
61298 +
61299 +       /* Whole number of ports. */
61300 +       count &= ~(sizeof(evtchn_port_t)-1);
61301 +
61302 +       if (count == 0)
61303 +               return 0;
61304 +
61305 +       if (count > PAGE_SIZE)
61306 +               count = PAGE_SIZE;
61307 +
61308 +       for (;;) {
61309 +               if (u->ring_overflow)
61310 +                       return -EFBIG;
61311 +
61312 +               if ((c = u->ring_cons) != (p = u->ring_prod))
61313 +                       break;
61314 +
61315 +               if (file->f_flags & O_NONBLOCK)
61316 +                       return -EAGAIN;
61317 +
61318 +               rc = wait_event_interruptible(
61319 +                       u->evtchn_wait, u->ring_cons != u->ring_prod);
61320 +               if (rc)
61321 +                       return rc;
61322 +       }
61323 +
61324 +       /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
61325 +       if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
61326 +               bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
61327 +                       sizeof(evtchn_port_t);
61328 +               bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
61329 +       } else {
61330 +               bytes1 = (p - c) * sizeof(evtchn_port_t);
61331 +               bytes2 = 0;
61332 +       }
61333 +
61334 +       /* Truncate chunks according to caller's maximum byte count. */
61335 +       if (bytes1 > count) {
61336 +               bytes1 = count;
61337 +               bytes2 = 0;
61338 +       } else if ((bytes1 + bytes2) > count) {
61339 +               bytes2 = count - bytes1;
61340 +       }
61341 +
61342 +       if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
61343 +           ((bytes2 != 0) &&
61344 +            copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
61345 +               return -EFAULT;
61346 +
61347 +       u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
61348 +
61349 +       return bytes1 + bytes2;
61350 +}
61351 +
61352 +static ssize_t evtchn_write(struct file *file, const char __user *buf,
61353 +                           size_t count, loff_t *ppos)
61354 +{
61355 +       int  rc, i;
61356 +       evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
61357 +       struct per_user_data *u = file->private_data;
61358 +
61359 +       if (kbuf == NULL)
61360 +               return -ENOMEM;
61361 +
61362 +       /* Whole number of ports. */
61363 +       count &= ~(sizeof(evtchn_port_t)-1);
61364 +
61365 +       if (count == 0) {
61366 +               rc = 0;
61367 +               goto out;
61368 +       }
61369 +
61370 +       if (count > PAGE_SIZE)
61371 +               count = PAGE_SIZE;
61372 +
61373 +       if (copy_from_user(kbuf, buf, count) != 0) {
61374 +               rc = -EFAULT;
61375 +               goto out;
61376 +       }
61377 +
61378 +       spin_lock_irq(&port_user_lock);
61379 +       for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
61380 +               if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
61381 +                       unmask_evtchn(kbuf[i]);
61382 +       spin_unlock_irq(&port_user_lock);
61383 +
61384 +       rc = count;
61385 +
61386 + out:
61387 +       free_page((unsigned long)kbuf);
61388 +       return rc;
61389 +}
61390 +
61391 +static void evtchn_bind_to_user(struct per_user_data *u, int port)
61392 +{
61393 +       spin_lock_irq(&port_user_lock);
61394 +       BUG_ON(port_user[port] != NULL);
61395 +       port_user[port] = u;
61396 +       unmask_evtchn(port);
61397 +       spin_unlock_irq(&port_user_lock);
61398 +}
61399 +
61400 +static int evtchn_ioctl(struct inode *inode, struct file *file,
61401 +                       unsigned int cmd, unsigned long arg)
61402 +{
61403 +       int rc;
61404 +       struct per_user_data *u = file->private_data;
61405 +       void __user *uarg = (void __user *) arg;
61406 +
61407 +       switch (cmd) {
61408 +       case IOCTL_EVTCHN_BIND_VIRQ: {
61409 +               struct ioctl_evtchn_bind_virq bind;
61410 +               struct evtchn_bind_virq bind_virq;
61411 +
61412 +               rc = -EFAULT;
61413 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
61414 +                       break;
61415 +
61416 +               bind_virq.virq = bind.virq;
61417 +               bind_virq.vcpu = 0;
61418 +               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
61419 +                                                &bind_virq);
61420 +               if (rc != 0)
61421 +                       break;
61422 +
61423 +               rc = bind_virq.port;
61424 +               evtchn_bind_to_user(u, rc);
61425 +               break;
61426 +       }
61427 +
61428 +       case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
61429 +               struct ioctl_evtchn_bind_interdomain bind;
61430 +               struct evtchn_bind_interdomain bind_interdomain;
61431 +
61432 +               rc = -EFAULT;
61433 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
61434 +                       break;
61435 +
61436 +               bind_interdomain.remote_dom  = bind.remote_domain;
61437 +               bind_interdomain.remote_port = bind.remote_port;
61438 +               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
61439 +                                                &bind_interdomain);
61440 +               if (rc != 0)
61441 +                       break;
61442 +
61443 +               rc = bind_interdomain.local_port;
61444 +               evtchn_bind_to_user(u, rc);
61445 +               break;
61446 +       }
61447 +
61448 +       case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
61449 +               struct ioctl_evtchn_bind_unbound_port bind;
61450 +               struct evtchn_alloc_unbound alloc_unbound;
61451 +
61452 +               rc = -EFAULT;
61453 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
61454 +                       break;
61455 +
61456 +               alloc_unbound.dom        = DOMID_SELF;
61457 +               alloc_unbound.remote_dom = bind.remote_domain;
61458 +               rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
61459 +                                                &alloc_unbound);
61460 +               if (rc != 0)
61461 +                       break;
61462 +
61463 +               rc = alloc_unbound.port;
61464 +               evtchn_bind_to_user(u, rc);
61465 +               break;
61466 +       }
61467 +
61468 +       case IOCTL_EVTCHN_UNBIND: {
61469 +               struct ioctl_evtchn_unbind unbind;
61470 +               struct evtchn_close close;
61471 +               int ret;
61472 +
61473 +               rc = -EFAULT;
61474 +               if (copy_from_user(&unbind, uarg, sizeof(unbind)))
61475 +                       break;
61476 +
61477 +               rc = -EINVAL;
61478 +               if (unbind.port >= NR_EVENT_CHANNELS)
61479 +                       break;
61480 +
61481 +               spin_lock_irq(&port_user_lock);
61482 +    
61483 +               rc = -ENOTCONN;
61484 +               if (port_user[unbind.port] != u) {
61485 +                       spin_unlock_irq(&port_user_lock);
61486 +                       break;
61487 +               }
61488 +
61489 +               port_user[unbind.port] = NULL;
61490 +               mask_evtchn(unbind.port);
61491 +
61492 +               spin_unlock_irq(&port_user_lock);
61493 +
61494 +               close.port = unbind.port;
61495 +               ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
61496 +               BUG_ON(ret);
61497 +
61498 +               rc = 0;
61499 +               break;
61500 +       }
61501 +
61502 +       case IOCTL_EVTCHN_NOTIFY: {
61503 +               struct ioctl_evtchn_notify notify;
61504 +
61505 +               rc = -EFAULT;
61506 +               if (copy_from_user(&notify, uarg, sizeof(notify)))
61507 +                       break;
61508 +
61509 +               if (notify.port >= NR_EVENT_CHANNELS) {
61510 +                       rc = -EINVAL;
61511 +               } else if (port_user[notify.port] != u) {
61512 +                       rc = -ENOTCONN;
61513 +               } else {
61514 +                       notify_remote_via_evtchn(notify.port);
61515 +                       rc = 0;
61516 +               }
61517 +               break;
61518 +       }
61519 +
61520 +       case IOCTL_EVTCHN_RESET: {
61521 +               /* Initialise the ring to empty. Clear errors. */
61522 +               spin_lock_irq(&port_user_lock);
61523 +               u->ring_cons = u->ring_prod = u->ring_overflow = 0;
61524 +               spin_unlock_irq(&port_user_lock);
61525 +               rc = 0;
61526 +               break;
61527 +       }
61528 +
61529 +       default:
61530 +               rc = -ENOSYS;
61531 +               break;
61532 +       }
61533 +
61534 +       return rc;
61535 +}
61536 +
61537 +static unsigned int evtchn_poll(struct file *file, poll_table *wait)
61538 +{
61539 +       unsigned int mask = POLLOUT | POLLWRNORM;
61540 +       struct per_user_data *u = file->private_data;
61541 +
61542 +       poll_wait(file, &u->evtchn_wait, wait);
61543 +       if (u->ring_cons != u->ring_prod)
61544 +               mask |= POLLIN | POLLRDNORM;
61545 +       if (u->ring_overflow)
61546 +               mask = POLLERR;
61547 +       return mask;
61548 +}
61549 +
61550 +static int evtchn_fasync(int fd, struct file *filp, int on)
61551 +{
61552 +       struct per_user_data *u = filp->private_data;
61553 +       return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
61554 +}
61555 +
61556 +static int evtchn_open(struct inode *inode, struct file *filp)
61557 +{
61558 +       struct per_user_data *u;
61559 +
61560 +       if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
61561 +               return -ENOMEM;
61562 +
61563 +       memset(u, 0, sizeof(*u));
61564 +       init_waitqueue_head(&u->evtchn_wait);
61565 +
61566 +       u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
61567 +       if (u->ring == NULL) {
61568 +               kfree(u);
61569 +               return -ENOMEM;
61570 +       }
61571 +
61572 +       filp->private_data = u;
61573 +
61574 +       return 0;
61575 +}
61576 +
61577 +static int evtchn_release(struct inode *inode, struct file *filp)
61578 +{
61579 +       int i;
61580 +       struct per_user_data *u = filp->private_data;
61581 +       struct evtchn_close close;
61582 +
61583 +       spin_lock_irq(&port_user_lock);
61584 +
61585 +       free_page((unsigned long)u->ring);
61586 +
61587 +       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
61588 +               int ret;
61589 +               if (port_user[i] != u)
61590 +                       continue;
61591 +
61592 +               port_user[i] = NULL;
61593 +               mask_evtchn(i);
61594 +
61595 +               close.port = i;
61596 +               ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
61597 +               BUG_ON(ret);
61598 +       }
61599 +
61600 +       spin_unlock_irq(&port_user_lock);
61601 +
61602 +       kfree(u);
61603 +
61604 +       return 0;
61605 +}
61606 +
61607 +static struct file_operations evtchn_fops = {
61608 +       .owner   = THIS_MODULE,
61609 +       .read    = evtchn_read,
61610 +       .write   = evtchn_write,
61611 +       .ioctl   = evtchn_ioctl,
61612 +       .poll    = evtchn_poll,
61613 +       .fasync  = evtchn_fasync,
61614 +       .open    = evtchn_open,
61615 +       .release = evtchn_release,
61616 +};
61617 +
61618 +static struct miscdevice evtchn_miscdev = {
61619 +       .minor        = MISC_DYNAMIC_MINOR,
61620 +       .name         = "evtchn",
61621 +       .fops         = &evtchn_fops,
61622 +};
61623 +
61624 +static int __init evtchn_init(void)
61625 +{
61626 +       int err;
61627 +
61628 +       if (!is_running_on_xen())
61629 +               return -ENODEV;
61630 +
61631 +       spin_lock_init(&port_user_lock);
61632 +       memset(port_user, 0, sizeof(port_user));
61633 +
61634 +       /* Create '/dev/misc/evtchn'. */
61635 +       err = misc_register(&evtchn_miscdev);
61636 +       if (err != 0) {
61637 +               printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
61638 +               return err;
61639 +       }
61640 +
61641 +       printk("Event-channel device installed.\n");
61642 +
61643 +       return 0;
61644 +}
61645 +
61646 +static void evtchn_cleanup(void)
61647 +{
61648 +       misc_deregister(&evtchn_miscdev);
61649 +}
61650 +
61651 +module_init(evtchn_init);
61652 +module_exit(evtchn_cleanup);
61653 +
61654 +MODULE_LICENSE("Dual BSD/GPL");
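
The evtchn device multiplexes event channels to userspace: the ioctls bind ports to the open file, read() returns the ports that have fired (copied in up to two chunks when the ring wraps), and writing ports back re-unmasks them. A hypothetical userspace sketch (not part of the patch; the device node path depends on udev rules, VIRQ_TIMER's value is taken from the Xen interface headers, and error handling is omitted):

        #include <fcntl.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        #include <xen/public/evtchn.h>    /* assumed to be on the include path */

        int main(void)
        {
                struct ioctl_evtchn_bind_virq bind = { .virq = 0 /* VIRQ_TIMER */ };
                uint32_t port;
                int fd, bound;

                fd = open("/dev/xen/evtchn", O_RDWR);             /* or /dev/evtchn */
                bound = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind); /* returns the port */
                printf("VIRQ_TIMER bound to port %d\n", bound);

                for (;;) {
                        read(fd, &port, sizeof(port));    /* blocks until an event fires */
                        printf("event on port %u\n", port);
                        write(fd, &port, sizeof(port));   /* re-unmask the port */
                }
        }
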
61655 diff -ruNp linux-2.6.19/drivers/xen/fbfront/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/fbfront/Makefile
61656 --- linux-2.6.19/drivers/xen/fbfront/Makefile   1970-01-01 00:00:00.000000000 +0000
61657 +++ linux-2.6.19-xen-3.0.4/drivers/xen/fbfront/Makefile 2007-02-02 19:10:45.000000000 +0000
61658 @@ -0,0 +1,2 @@
61659 +obj-$(CONFIG_XEN_FRAMEBUFFER)  := xenfb.o
61660 +obj-$(CONFIG_XEN_KEYBOARD)     += xenkbd.o
61661 diff -ruNp linux-2.6.19/drivers/xen/fbfront/xenfb.c linux-2.6.19-xen-3.0.4/drivers/xen/fbfront/xenfb.c
61662 --- linux-2.6.19/drivers/xen/fbfront/xenfb.c    1970-01-01 00:00:00.000000000 +0000
61663 +++ linux-2.6.19-xen-3.0.4/drivers/xen/fbfront/xenfb.c  2007-02-02 19:10:45.000000000 +0000
61664 @@ -0,0 +1,750 @@
61665 +/*
61666 + * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device
61667 + *
61668 + * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com>
61669 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
61670 + *
61671 + *  Based on linux/drivers/video/q40fb.c
61672 + *
61673 + *  This file is subject to the terms and conditions of the GNU General Public
61674 + *  License. See the file COPYING in the main directory of this archive for
61675 + *  more details.
61676 + */
61677 +
61678 +/*
61679 + * TODO:
61680 + *
61681 + * Switch to grant tables when they become capable of dealing with the
61682 + * frame buffer.
61683 + */
61684 +
61685 +#include <linux/kernel.h>
61686 +#include <linux/errno.h>
61687 +#include <linux/fb.h>
61688 +#include <linux/module.h>
61689 +#include <linux/vmalloc.h>
61690 +#include <linux/mm.h>
61691 +#include <asm/hypervisor.h>
61692 +#include <xen/evtchn.h>
61693 +#include <xen/interface/io/fbif.h>
61694 +#include <xen/xenbus.h>
61695 +#include <linux/kthread.h>
61696 +
61697 +struct xenfb_mapping
61698 +{
61699 +       struct list_head        link;
61700 +       struct vm_area_struct   *vma;
61701 +       atomic_t                map_refs;
61702 +       int                     faults;
61703 +       struct xenfb_info       *info;
61704 +};
61705 +
61706 +struct xenfb_info
61707 +{
61708 +       struct task_struct      *kthread;
61709 +       wait_queue_head_t       wq;
61710 +
61711 +       unsigned char           *fb;
61712 +       struct fb_info          *fb_info;
61713 +       struct timer_list       refresh;
61714 +       int                     dirty;
61715 +       int                     x1, y1, x2, y2; /* dirty rectangle,
61716 +                                                  protected by dirty_lock */
61717 +       spinlock_t              dirty_lock;
61718 +       struct mutex            mm_lock;
61719 +       int                     nr_pages;
61720 +       struct page             **pages;
61721 +       struct list_head        mappings; /* protected by mm_lock */
61722 +
61723 +       unsigned                evtchn;
61724 +       int                     irq;
61725 +       struct xenfb_page       *page;
61726 +       unsigned long           *mfns;
61727 +       int                     update_wanted; /* XENFB_TYPE_UPDATE wanted */
61728 +
61729 +       struct xenbus_device    *xbdev;
61730 +};
61731 +
61732 +/*
61733 + * How the locks work together
61734 + *
61735 + * There are two locks: spinlock dirty_lock protecting the dirty
61736 + * rectangle, and mutex mm_lock protecting mappings.
61737 + *
61738 + * The problem is that dirty rectangle and mappings aren't
61739 + * independent: the dirty rectangle must cover all faulted pages in
61740 + * mappings.  We need to prove that our locking maintains this
61741 + * invariant.
61742 + *
61743 + * There are several kinds of critical regions:
61744 + *
61745 + * 1. Holding only dirty_lock: xenfb_refresh().  May run in
61746 + *    interrupts.  Extends the dirty rectangle.  Trivially preserves
61747 + *    invariant.
61748 + *
61749 + * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close().  Touch
61750 + *    only mappings.  The former creates unfaulted pages.  Preserves
61751 + *    invariant.  The latter removes pages.  Preserves invariant.
61752 + *
61753 + * 3. Holding both locks: xenfb_vm_nopage().  Extends the dirty
61754 + *    rectangle and updates mappings consistently.  Preserves
61755 + *    invariant.
61756 + *
61757 + * 4. The ugliest one: xenfb_update_screen().  Clear the dirty
61758 + *    rectangle and update mappings consistently.
61759 + *
61760 + *    We can't simply hold both locks, because zap_page_range() cannot
61761 + *    be called with a spinlock held.
61762 + *
61763 + *    Therefore, we first clear the dirty rectangle with both locks
61764 + *    held.  Then we unlock dirty_lock and update the mappings.
61765 + *    Critical regions that hold only dirty_lock may interfere with
61766 + *    that.  This can only be region 1: xenfb_refresh().  But that
61767 + *    just extends the dirty rectangle, which can't harm the
61768 + *    invariant.
61769 + *
61770 + * But FIXME: the invariant is too weak.  It misses that the fault
61771 + * record in mappings must be consistent with the mapping of pages in
61772 + * the associated address space!  do_no_page() updates the PTE after
61773 + * xenfb_vm_nopage() returns, i.e. outside the critical region.  This
61774 + * allows the following race:
61775 + *
61776 + * X writes to some address in the Xen frame buffer
61777 + * Fault - call do_no_page()
61778 + *     call xenfb_vm_nopage()
61779 + *         grab mm_lock
61780 + *         map->faults++;
61781 + *         release mm_lock
61782 + *     return back to do_no_page()
61783 + * (preempted, or SMP)
61784 + * Xen worker thread runs.
61785 + *      grab mm_lock
61786 + *      look at mappings
61787 + *          find this mapping, zaps its pages (but page not in pte yet)
61788 + *          clear map->faults
61789 + *      releases mm_lock
61790 + * (back to X process)
61791 + *     put page in X's pte
61792 + *
61793 + * Oh well, we won't be updating the writes to this page anytime soon.
61794 + */
61795 +
61796 +static int xenfb_fps = 20;
61797 +static unsigned long xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8;
61798 +
61799 +static int xenfb_remove(struct xenbus_device *);
61800 +static void xenfb_init_shared_page(struct xenfb_info *);
61801 +static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
61802 +static void xenfb_disconnect_backend(struct xenfb_info *);
61803 +
61804 +static void xenfb_do_update(struct xenfb_info *info,
61805 +                           int x, int y, int w, int h)
61806 +{
61807 +       union xenfb_out_event event;
61808 +       __u32 prod;
61809 +
61810 +       event.type = XENFB_TYPE_UPDATE;
61811 +       event.update.x = x;
61812 +       event.update.y = y;
61813 +       event.update.width = w;
61814 +       event.update.height = h;
61815 +
61816 +       prod = info->page->out_prod;
61817 +       /* caller ensures !xenfb_queue_full() */
61818 +       mb();                   /* ensure ring space available */
61819 +       XENFB_OUT_RING_REF(info->page, prod) = event;
61820 +       wmb();                  /* ensure ring contents visible */
61821 +       info->page->out_prod = prod + 1;
61822 +
61823 +       notify_remote_via_evtchn(info->evtchn);
61824 +}
61825 +
61826 +static int xenfb_queue_full(struct xenfb_info *info)
61827 +{
61828 +       __u32 cons, prod;
61829 +
61830 +       prod = info->page->out_prod;
61831 +       cons = info->page->out_cons;
61832 +       return prod - cons == XENFB_OUT_RING_LEN;
61833 +}
61834 +
61835 +static void xenfb_update_screen(struct xenfb_info *info)
61836 +{
61837 +       unsigned long flags;
61838 +       int y1, y2, x1, x2;
61839 +       struct xenfb_mapping *map;
61840 +
61841 +       if (!info->update_wanted)
61842 +               return;
61843 +       if (xenfb_queue_full(info))
61844 +               return;
61845 +
61846 +       mutex_lock(&info->mm_lock);
61847 +
61848 +       spin_lock_irqsave(&info->dirty_lock, flags);
61849 +       y1 = info->y1;
61850 +       y2 = info->y2;
61851 +       x1 = info->x1;
61852 +       x2 = info->x2;
61853 +       info->x1 = info->y1 = INT_MAX;
61854 +       info->x2 = info->y2 = 0;
61855 +       spin_unlock_irqrestore(&info->dirty_lock, flags);
61856 +
61857 +       list_for_each_entry(map, &info->mappings, link) {
61858 +               if (!map->faults)
61859 +                       continue;
61860 +               zap_page_range(map->vma, map->vma->vm_start,
61861 +                              map->vma->vm_end - map->vma->vm_start, NULL);
61862 +               map->faults = 0;
61863 +       }
61864 +
61865 +       mutex_unlock(&info->mm_lock);
61866 +
61867 +       xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
61868 +}
61869 +
61870 +static int xenfb_thread(void *data)
61871 +{
61872 +       struct xenfb_info *info = data;
61873 +
61874 +       while (!kthread_should_stop()) {
61875 +               if (info->dirty) {
61876 +                       info->dirty = 0;
61877 +                       xenfb_update_screen(info);
61878 +               }
61879 +               wait_event_interruptible(info->wq,
61880 +                       kthread_should_stop() || info->dirty);
61881 +               try_to_freeze();
61882 +       }
61883 +       return 0;
61884 +}
61885 +
61886 +static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
61887 +                          unsigned blue, unsigned transp,
61888 +                          struct fb_info *info)
61889 +{
61890 +       u32 v;
61891 +
61892 +       if (regno > info->cmap.len)
61893 +               return 1;
61894 +
61895 +       red   >>= (16 - info->var.red.length);
61896 +       green >>= (16 - info->var.green.length);
61897 +       blue  >>= (16 - info->var.blue.length);
61898 +
61899 +       v = (red << info->var.red.offset) |
61900 +           (green << info->var.green.offset) |
61901 +           (blue << info->var.blue.offset);
61902 +
61903 +       /* FIXME is this sane?  check against xxxfb_setcolreg()!  */
61904 +       switch (info->var.bits_per_pixel) {
61905 +       case 16:
61906 +       case 24:
61907 +       case 32:
61908 +               ((u32 *)info->pseudo_palette)[regno] = v;
61909 +               break;
61910 +       }
61911 +       
61912 +       return 0;
61913 +}
61914 +
61915 +static void xenfb_timer(unsigned long data)
61916 +{
61917 +       struct xenfb_info *info = (struct xenfb_info *)data;
61918 +       info->dirty = 1;
61919 +       wake_up(&info->wq);
61920 +}
61921 +
61922 +static void __xenfb_refresh(struct xenfb_info *info,
61923 +                           int x1, int y1, int w, int h)
61924 +{
61925 +       int y2, x2;
61926 +
61927 +       y2 = y1 + h;
61928 +       x2 = x1 + w;
61929 +
61930 +       if (info->y1 > y1)
61931 +               info->y1 = y1;
61932 +       if (info->y2 < y2)
61933 +               info->y2 = y2;
61934 +       if (info->x1 > x1)
61935 +               info->x1 = x1;
61936 +       if (info->x2 < x2)
61937 +               info->x2 = x2;
61938 +
61939 +       if (timer_pending(&info->refresh))
61940 +               return;
61941 +
61942 +       mod_timer(&info->refresh, jiffies + HZ/xenfb_fps);
61943 +}
61944 +
61945 +static void xenfb_refresh(struct xenfb_info *info,
61946 +                         int x1, int y1, int w, int h)
61947 +{
61948 +       unsigned long flags;
61949 +
61950 +       spin_lock_irqsave(&info->dirty_lock, flags);
61951 +       __xenfb_refresh(info, x1, y1, w, h);
61952 +       spin_unlock_irqrestore(&info->dirty_lock, flags);
61953 +}
61954 +
61955 +static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
61956 +{
61957 +       struct xenfb_info *info = p->par;
61958 +
61959 +       cfb_fillrect(p, rect);
61960 +       xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height);
61961 +}
61962 +
61963 +static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image)
61964 +{
61965 +       struct xenfb_info *info = p->par;
61966 +
61967 +       cfb_imageblit(p, image);
61968 +       xenfb_refresh(info, image->dx, image->dy, image->width, image->height);
61969 +}
61970 +
61971 +static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
61972 +{
61973 +       struct xenfb_info *info = p->par;
61974 +
61975 +       cfb_copyarea(p, area);
61976 +       xenfb_refresh(info, area->dx, area->dy, area->width, area->height);
61977 +}
61978 +
61979 +static void xenfb_vm_open(struct vm_area_struct *vma)
61980 +{
61981 +       struct xenfb_mapping *map = vma->vm_private_data;
61982 +       atomic_inc(&map->map_refs);
61983 +}
61984 +
61985 +static void xenfb_vm_close(struct vm_area_struct *vma)
61986 +{
61987 +       struct xenfb_mapping *map = vma->vm_private_data;
61988 +       struct xenfb_info *info = map->info;
61989 +
61990 +       mutex_lock(&info->mm_lock);
61991 +       if (atomic_dec_and_test(&map->map_refs)) {
61992 +               list_del(&map->link);
61993 +               kfree(map);
61994 +       }
61995 +       mutex_unlock(&info->mm_lock);
61996 +}
61997 +
61998 +static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
61999 +                                   unsigned long vaddr, int *type)
62000 +{
62001 +       struct xenfb_mapping *map = vma->vm_private_data;
62002 +       struct xenfb_info *info = map->info;
62003 +       int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
62004 +       unsigned long flags;
62005 +       struct page *page;
62006 +       int y1, y2;
62007 +
62008 +       if (pgnr >= info->nr_pages)
62009 +               return NOPAGE_SIGBUS;
62010 +
62011 +       mutex_lock(&info->mm_lock);
62012 +       spin_lock_irqsave(&info->dirty_lock, flags);
62013 +       page = info->pages[pgnr];
62014 +       get_page(page);
62015 +       map->faults++;
62016 +
62017 +       y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length;
62018 +       y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length;
62019 +       if (y2 > info->fb_info->var.yres)
62020 +               y2 = info->fb_info->var.yres;
62021 +       __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1);
62022 +       spin_unlock_irqrestore(&info->dirty_lock, flags);
62023 +       mutex_unlock(&info->mm_lock);
62024 +
62025 +       if (type)
62026 +               *type = VM_FAULT_MINOR;
62027 +
62028 +       return page;
62029 +}
62030 +
62031 +static struct vm_operations_struct xenfb_vm_ops = {
62032 +       .open   = xenfb_vm_open,
62033 +       .close  = xenfb_vm_close,
62034 +       .nopage = xenfb_vm_nopage,
62035 +};
62036 +
62037 +static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
62038 +{
62039 +       struct xenfb_info *info = fb_info->par;
62040 +       struct xenfb_mapping *map;
62041 +       int map_pages;
62042 +
62043 +       if (!(vma->vm_flags & VM_WRITE))
62044 +               return -EINVAL;
62045 +       if (!(vma->vm_flags & VM_SHARED))
62046 +               return -EINVAL;
62047 +       if (vma->vm_pgoff != 0)
62048 +               return -EINVAL;
62049 +
62050 +       map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT;
62051 +       if (map_pages > info->nr_pages)
62052 +               return -EINVAL;
62053 +
62054 +       map = kzalloc(sizeof(*map), GFP_KERNEL);
62055 +       if (map == NULL)
62056 +               return -ENOMEM;
62057 +
62058 +       map->vma = vma;
62059 +       map->faults = 0;
62060 +       map->info = info;
62061 +       atomic_set(&map->map_refs, 1);
62062 +
62063 +       mutex_lock(&info->mm_lock);
62064 +       list_add(&map->link, &info->mappings);
62065 +       mutex_unlock(&info->mm_lock);
62066 +
62067 +       vma->vm_ops = &xenfb_vm_ops;
62068 +       vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
62069 +       vma->vm_private_data = map;
62070 +
62071 +       return 0;
62072 +}
62073 +
62074 +static struct fb_ops xenfb_fb_ops = {
62075 +       .owner          = THIS_MODULE,
62076 +       .fb_setcolreg   = xenfb_setcolreg,
62077 +       .fb_fillrect    = xenfb_fillrect,
62078 +       .fb_copyarea    = xenfb_copyarea,
62079 +       .fb_imageblit   = xenfb_imageblit,
62080 +       .fb_mmap        = xenfb_mmap,
62081 +};
62082 +
62083 +static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
62084 +                                      struct pt_regs *regs)
62085 +{
62086 +       /*
62087 +        * No incoming events are recognized; simply ignore them all.
62088 +        * If you need to recognize some, see xenkbd's input_handler()
62089 +        * for how to do that.
62090 +        */
62091 +       struct xenfb_info *info = dev_id;
62092 +       struct xenfb_page *page = info->page;
62093 +
62094 +       if (page->in_cons != page->in_prod) {
62095 +               info->page->in_cons = info->page->in_prod;
62096 +               notify_remote_via_evtchn(info->evtchn);
62097 +       }
62098 +       return IRQ_HANDLED;
62099 +}
62100 +
62101 +static unsigned long vmalloc_to_mfn(void *address)
62102 +{
62103 +       return pfn_to_mfn(vmalloc_to_pfn(address));
62104 +}
62105 +
62106 +static int __devinit xenfb_probe(struct xenbus_device *dev,
62107 +                                const struct xenbus_device_id *id)
62108 +{
62109 +       struct xenfb_info *info;
62110 +       struct fb_info *fb_info;
62111 +       int ret;
62112 +
62113 +       info = kzalloc(sizeof(*info), GFP_KERNEL);
62114 +       if (info == NULL) {
62115 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
62116 +               return -ENOMEM;
62117 +       }
62118 +       dev->dev.driver_data = info;
62119 +       info->xbdev = dev;
62120 +       info->irq = -1;
62121 +       info->x1 = info->y1 = INT_MAX;
62122 +       spin_lock_init(&info->dirty_lock);
62123 +       mutex_init(&info->mm_lock);
62124 +       init_waitqueue_head(&info->wq);
62125 +       init_timer(&info->refresh);
62126 +       info->refresh.function = xenfb_timer;
62127 +       info->refresh.data = (unsigned long)info;
62128 +       INIT_LIST_HEAD(&info->mappings);
62129 +
62130 +       info->fb = vmalloc(xenfb_mem_len);
62131 +       if (info->fb == NULL)
62132 +               goto error_nomem;
62133 +       memset(info->fb, 0, xenfb_mem_len);
62134 +
62135 +       info->nr_pages = (xenfb_mem_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
62136 +
62137 +       info->pages = kmalloc(sizeof(struct page *) * info->nr_pages,
62138 +                             GFP_KERNEL);
62139 +       if (info->pages == NULL)
62140 +               goto error_nomem;
62141 +
62142 +       info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
62143 +       if (!info->mfns)
62144 +               goto error_nomem;
62145 +
62146 +       /* set up shared page */
62147 +       info->page = (void *)__get_free_page(GFP_KERNEL);
62148 +       if (!info->page)
62149 +               goto error_nomem;
62150 +
62151 +       xenfb_init_shared_page(info);
62152 +
62153 +       fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL);
62154 +                               /* see fishy hackery below */
62155 +       if (fb_info == NULL)
62156 +               goto error_nomem;
62157 +
62158 +       /* FIXME fishy hackery */
62159 +       fb_info->pseudo_palette = fb_info->par;
62160 +       fb_info->par = info;
62161 +       /* /FIXME */
62162 +       fb_info->screen_base = info->fb;
62163 +
62164 +       fb_info->fbops = &xenfb_fb_ops;
62165 +       fb_info->var.xres_virtual = fb_info->var.xres = info->page->width;
62166 +       fb_info->var.yres_virtual = fb_info->var.yres = info->page->height;
62167 +       fb_info->var.bits_per_pixel = info->page->depth;
62168 +
62169 +       fb_info->var.red = (struct fb_bitfield){16, 8, 0};
62170 +       fb_info->var.green = (struct fb_bitfield){8, 8, 0};
62171 +       fb_info->var.blue = (struct fb_bitfield){0, 8, 0};
62172 +
62173 +       fb_info->var.activate = FB_ACTIVATE_NOW;
62174 +       fb_info->var.height = -1;
62175 +       fb_info->var.width = -1;
62176 +       fb_info->var.vmode = FB_VMODE_NONINTERLACED;
62177 +
62178 +       fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
62179 +       fb_info->fix.line_length = info->page->line_length;
62180 +       fb_info->fix.smem_start = 0;
62181 +       fb_info->fix.smem_len = xenfb_mem_len;
62182 +       strcpy(fb_info->fix.id, "xen");
62183 +       fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
62184 +       fb_info->fix.accel = FB_ACCEL_NONE;
62185 +
62186 +       fb_info->flags = FBINFO_FLAG_DEFAULT;
62187 +
62188 +       ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
62189 +       if (ret < 0) {
62190 +               framebuffer_release(fb_info);
62191 +               xenbus_dev_fatal(dev, ret, "fb_alloc_cmap");
62192 +               goto error;
62193 +       }
62194 +
62195 +       ret = register_framebuffer(fb_info);
62196 +       if (ret) {
62197 +               fb_dealloc_cmap(&info->fb_info->cmap);
62198 +               framebuffer_release(fb_info);
62199 +               xenbus_dev_fatal(dev, ret, "register_framebuffer");
62200 +               goto error;
62201 +       }
62202 +       info->fb_info = fb_info;
62203 +
62204 +       /* FIXME should this be delayed until backend XenbusStateConnected? */
62205 +       info->kthread = kthread_run(xenfb_thread, info, "xenfb thread");
62206 +       if (IS_ERR(info->kthread)) {
62207 +               ret = PTR_ERR(info->kthread);
62208 +               info->kthread = NULL;
62209 +               xenbus_dev_fatal(dev, ret, "register_framebuffer");
62210 +               goto error;
62211 +       }
62212 +
62213 +       ret = xenfb_connect_backend(dev, info);
62214 +       if (ret < 0)
62215 +               goto error;
62216 +
62217 +       return 0;
62218 +
62219 + error_nomem:
62220 +       ret = -ENOMEM;
62221 +       xenbus_dev_fatal(dev, ret, "allocating device memory");
62222 + error:
62223 +       xenfb_remove(dev);
62224 +       return ret;
62225 +}
62226 +
62227 +static int xenfb_resume(struct xenbus_device *dev)
62228 +{
62229 +       struct xenfb_info *info = dev->dev.driver_data;
62230 +
62231 +       xenfb_disconnect_backend(info);
62232 +       xenfb_init_shared_page(info);
62233 +       return xenfb_connect_backend(dev, info);
62234 +}
62235 +
62236 +static int xenfb_remove(struct xenbus_device *dev)
62237 +{
62238 +       struct xenfb_info *info = dev->dev.driver_data;
62239 +
62240 +       del_timer(&info->refresh);
62241 +       if (info->kthread)
62242 +               kthread_stop(info->kthread);
62243 +       xenfb_disconnect_backend(info);
62244 +       if (info->fb_info) {
62245 +               unregister_framebuffer(info->fb_info);
62246 +               fb_dealloc_cmap(&info->fb_info->cmap);
62247 +               framebuffer_release(info->fb_info);
62248 +       }
62249 +       free_page((unsigned long)info->page);
62250 +       vfree(info->mfns);
62251 +       kfree(info->pages);
62252 +       vfree(info->fb);
62253 +       kfree(info);
62254 +
62255 +       return 0;
62256 +}
62257 +
62258 +static void xenfb_init_shared_page(struct xenfb_info *info)
62259 +{
62260 +       int i;
62261 +
62262 +       for (i = 0; i < info->nr_pages; i++)
62263 +               info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
62264 +
62265 +       for (i = 0; i < info->nr_pages; i++)
62266 +               info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
62267 +
62268 +       info->page->pd[0] = vmalloc_to_mfn(info->mfns);
62269 +       info->page->pd[1] = 0;
62270 +       info->page->width = XENFB_WIDTH;
62271 +       info->page->height = XENFB_HEIGHT;
62272 +       info->page->depth = XENFB_DEPTH;
62273 +       info->page->line_length = (info->page->depth / 8) * info->page->width;
62274 +       info->page->mem_length = xenfb_mem_len;
62275 +       info->page->in_cons = info->page->in_prod = 0;
62276 +       info->page->out_cons = info->page->out_prod = 0;
62277 +}
62278 +
62279 +static int xenfb_connect_backend(struct xenbus_device *dev,
62280 +                                struct xenfb_info *info)
62281 +{
62282 +       int ret;
62283 +       struct xenbus_transaction xbt;
62284 +
62285 +       ret = xenbus_alloc_evtchn(dev, &info->evtchn);
62286 +       if (ret)
62287 +               return ret;
62288 +       ret = bind_evtchn_to_irqhandler(info->evtchn, xenfb_event_handler,
62289 +                                       0, "xenfb", info);
62290 +       if (ret < 0) {
62291 +               xenbus_free_evtchn(dev, info->evtchn);
62292 +               xenbus_dev_fatal(dev, ret, "bind_evtchn_to_irqhandler");
62293 +               return ret;
62294 +       }
62295 +       info->irq = ret;
62296 +
62297 + again:
62298 +       ret = xenbus_transaction_start(&xbt);
62299 +       if (ret) {
62300 +               xenbus_dev_fatal(dev, ret, "starting transaction");
62301 +               return ret;
62302 +       }
62303 +       ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
62304 +                           virt_to_mfn(info->page));
62305 +       if (ret)
62306 +               goto error_xenbus;
62307 +       ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
62308 +                           info->evtchn);
62309 +       if (ret)
62310 +               goto error_xenbus;
62311 +       ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
62312 +       if (ret)
62313 +               goto error_xenbus;
62314 +       ret = xenbus_transaction_end(xbt, 0);
62315 +       if (ret) {
62316 +               if (ret == -EAGAIN)
62317 +                       goto again;
62318 +               xenbus_dev_fatal(dev, ret, "completing transaction");
62319 +               return ret;
62320 +       }
62321 +
62322 +       xenbus_switch_state(dev, XenbusStateInitialised);
62323 +       return 0;
62324 +
62325 + error_xenbus:
62326 +       xenbus_transaction_end(xbt, 1);
62327 +       xenbus_dev_fatal(dev, ret, "writing xenstore");
62328 +       return ret;
62329 +}
62330 +
62331 +static void xenfb_disconnect_backend(struct xenfb_info *info)
62332 +{
62333 +       if (info->irq >= 0)
62334 +               unbind_from_irqhandler(info->irq, info);
62335 +       info->irq = -1;
62336 +}
62337 +
62338 +static void xenfb_backend_changed(struct xenbus_device *dev,
62339 +                                 enum xenbus_state backend_state)
62340 +{
62341 +       struct xenfb_info *info = dev->dev.driver_data;
62342 +       int val;
62343 +
62344 +       switch (backend_state) {
62345 +       case XenbusStateInitialising:
62346 +       case XenbusStateInitialised:
62347 +       case XenbusStateUnknown:
62348 +       case XenbusStateClosed:
62349 +               break;
62350 +
62351 +       case XenbusStateInitWait:
62352 +       InitWait:
62353 +               xenbus_switch_state(dev, XenbusStateConnected);
62354 +               break;
62355 +
62356 +       case XenbusStateConnected:
62357 +               /*
62358 +                * Work around xenbus race condition: If backend goes
62359 +                * through InitWait to Connected fast enough, we can
62360 +                * get Connected twice here.
62361 +                */
62362 +               if (dev->state != XenbusStateConnected)
62363 +                       goto InitWait; /* no InitWait seen yet, fudge it */
62364 +
62365 +               if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
62366 +                                "request-update", "%d", &val) < 0)
62367 +                       val = 0;
62368 +               if (val)
62369 +                       info->update_wanted = 1;
62370 +               break;
62371 +
62372 +       case XenbusStateClosing:
62373 +               // FIXME is this safe in any dev->state?
62374 +               xenbus_frontend_closed(dev);
62375 +               break;
62376 +       }
62377 +}
62378 +
62379 +static struct xenbus_device_id xenfb_ids[] = {
62380 +       { "vfb" },
62381 +       { "" }
62382 +};
62383 +
62384 +static struct xenbus_driver xenfb = {
62385 +       .name = "vfb",
62386 +       .owner = THIS_MODULE,
62387 +       .ids = xenfb_ids,
62388 +       .probe = xenfb_probe,
62389 +       .remove = xenfb_remove,
62390 +       .resume = xenfb_resume,
62391 +       .otherend_changed = xenfb_backend_changed,
62392 +};
62393 +
62394 +static int __init xenfb_init(void)
62395 +{
62396 +       if (!is_running_on_xen())
62397 +               return -ENODEV;
62398 +
62399 +       /* Nothing to do if running in dom0. */
62400 +       if (is_initial_xendomain())
62401 +               return -ENODEV;
62402 +
62403 +       return xenbus_register_frontend(&xenfb);
62404 +}
62405 +
62406 +static void __exit xenfb_cleanup(void)
62407 +{
62408 +       return xenbus_unregister_driver(&xenfb);
62409 +}
62410 +
62411 +module_init(xenfb_init);
62412 +module_exit(xenfb_cleanup);
62413 +
62414 +MODULE_LICENSE("GPL");
62415 diff -ruNp linux-2.6.19/drivers/xen/fbfront/xenkbd.c linux-2.6.19-xen-3.0.4/drivers/xen/fbfront/xenkbd.c
62416 --- linux-2.6.19/drivers/xen/fbfront/xenkbd.c   1970-01-01 00:00:00.000000000 +0000
62417 +++ linux-2.6.19-xen-3.0.4/drivers/xen/fbfront/xenkbd.c 2007-02-02 19:10:45.000000000 +0000
62418 @@ -0,0 +1,300 @@
62419 +/*
62420 + * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device
62421 + *
62422 + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
62423 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
62424 + *
62425 + *  Based on linux/drivers/input/mouse/sermouse.c
62426 + *
62427 + *  This file is subject to the terms and conditions of the GNU General Public
62428 + *  License. See the file COPYING in the main directory of this archive for
62429 + *  more details.
62430 + */
62431 +
62432 +/*
62433 + * TODO:
62434 + *
62435 + * Switch to grant tables together with xenfb.c.
62436 + */
62437 +
62438 +#include <linux/kernel.h>
62439 +#include <linux/errno.h>
62440 +#include <linux/module.h>
62441 +#include <linux/input.h>
62442 +#include <asm/hypervisor.h>
62443 +#include <xen/evtchn.h>
62444 +#include <xen/interface/io/fbif.h>
62445 +#include <xen/interface/io/kbdif.h>
62446 +#include <xen/xenbus.h>
62447 +
62448 +struct xenkbd_info
62449 +{
62450 +       struct input_dev *dev;
62451 +       struct xenkbd_page *page;
62452 +       unsigned evtchn;
62453 +       int irq;
62454 +       struct xenbus_device *xbdev;
62455 +};
62456 +
62457 +static int xenkbd_remove(struct xenbus_device *);
62458 +static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *);
62459 +static void xenkbd_disconnect_backend(struct xenkbd_info *);
62460 +
62461 +/*
62462 + * Note: if you need to send out events, see xenfb_do_update() for how
62463 + * to do that.
62464 + */
62465 +
62466 +static irqreturn_t input_handler(int irq, void *dev_id, struct pt_regs *regs)
62467 +{
62468 +       struct xenkbd_info *info = dev_id;
62469 +       struct xenkbd_page *page = info->page;
62470 +       __u32 cons, prod;
62471 +
62472 +       prod = page->in_prod;
62473 +       if (prod == page->out_cons)
62474 +       if (prod == page->in_cons)
62475 +       rmb();                  /* ensure we see ring contents up to prod */
62476 +       for (cons = page->in_cons; cons != prod; cons++) {
62477 +               union xenkbd_in_event *event;
62478 +               event = &XENKBD_IN_RING_REF(page, cons);
62479 +
62480 +               switch (event->type) {
62481 +               case XENKBD_TYPE_MOTION:
62482 +                       input_report_rel(info->dev, REL_X, event->motion.rel_x);
62483 +                       input_report_rel(info->dev, REL_Y, event->motion.rel_y);
62484 +                       break;
62485 +               case XENKBD_TYPE_KEY:
62486 +                       input_report_key(info->dev, event->key.keycode, event->key.pressed);
62487 +                       break;
62488 +               case XENKBD_TYPE_POS:
62489 +                       input_report_abs(info->dev, ABS_X, event->pos.abs_x);
62490 +                       input_report_abs(info->dev, ABS_Y, event->pos.abs_y);
62491 +                       break;
62492 +               }
62493 +       }
62494 +       input_sync(info->dev);
62495 +       mb();                   /* ensure we got ring contents */
62496 +       page->in_cons = cons;
62497 +       notify_remote_via_evtchn(info->evtchn);
62498 +
62499 +       return IRQ_HANDLED;
62500 +}
62501 +
62502 +int __devinit xenkbd_probe(struct xenbus_device *dev,
62503 +                          const struct xenbus_device_id *id)
62504 +{
62505 +       int ret, i;
62506 +       struct xenkbd_info *info;
62507 +       struct input_dev *input_dev;
62508 +
62509 +       info = kzalloc(sizeof(*info), GFP_KERNEL);
62510 +       if (!info) {
62511 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
62512 +               return -ENOMEM;
62513 +       }
62514 +       dev->dev.driver_data = info;
62515 +       info->xbdev = dev;
62516 +
62517 +       info->page = (void *)__get_free_page(GFP_KERNEL);
62518 +       if (!info->page)
62519 +               goto error_nomem;
62520 +       info->page->in_cons = info->page->in_prod = 0;
62521 +       info->page->out_cons = info->page->out_prod = 0;
62522 +
62523 +       input_dev = input_allocate_device();
62524 +       if (!input_dev)
62525 +               goto error_nomem;
62526 +
62527 +       input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS);
62528 +       input_dev->keybit[LONG(BTN_MOUSE)]
62529 +               = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT);
62530 +       /* TODO additional buttons */
62531 +       input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y);
62532 +
62533 +       /* FIXME not sure this is quite right */
62534 +       for (i = 0; i < 256; i++)
62535 +               set_bit(i, input_dev->keybit);
62536 +
62537 +       input_dev->name = "Xen Virtual Keyboard/Mouse";
62538 +
62539 +       input_set_abs_params(input_dev, ABS_X, 0, XENFB_WIDTH, 0, 0);
62540 +       input_set_abs_params(input_dev, ABS_Y, 0, XENFB_HEIGHT, 0, 0);
62541 +
62542 +       ret = input_register_device(input_dev);
62543 +       if (ret) {
62544 +               input_free_device(input_dev);
62545 +               xenbus_dev_fatal(dev, ret, "input_register_device");
62546 +               goto error;
62547 +       }
62548 +       info->dev = input_dev;
62549 +
62550 +       ret = xenkbd_connect_backend(dev, info);
62551 +       if (ret < 0)
62552 +               goto error;
62553 +
62554 +       return 0;
62555 +
62556 + error_nomem:
62557 +       ret = -ENOMEM;
62558 +       xenbus_dev_fatal(dev, ret, "allocating device memory");
62559 + error:
62560 +       xenkbd_remove(dev);
62561 +       return ret;
62562 +}
62563 +
62564 +static int xenkbd_resume(struct xenbus_device *dev)
62565 +{
62566 +       struct xenkbd_info *info = dev->dev.driver_data;
62567 +
62568 +       xenkbd_disconnect_backend(info);
62569 +       return xenkbd_connect_backend(dev, info);
62570 +}
62571 +
62572 +static int xenkbd_remove(struct xenbus_device *dev)
62573 +{
62574 +       struct xenkbd_info *info = dev->dev.driver_data;
62575 +
62576 +       xenkbd_disconnect_backend(info);
62577 +       input_unregister_device(info->dev);
62578 +       free_page((unsigned long)info->page);
62579 +       kfree(info);
62580 +       return 0;
62581 +}
62582 +
62583 +static int xenkbd_connect_backend(struct xenbus_device *dev,
62584 +                                 struct xenkbd_info *info)
62585 +{
62586 +       int ret;
62587 +       struct xenbus_transaction xbt;
62588 +
62589 +       ret = xenbus_alloc_evtchn(dev, &info->evtchn);
62590 +       if (ret)
62591 +               return ret;
62592 +       ret = bind_evtchn_to_irqhandler(info->evtchn, input_handler, 0,
62593 +                                       "xenkbd", info);
62594 +       if (ret < 0) {
62595 +               xenbus_free_evtchn(dev, info->evtchn);
62596 +               xenbus_dev_fatal(dev, ret, "bind_evtchn_to_irqhandler");
62597 +               return ret;
62598 +       }
62599 +       info->irq = ret;
62600 +
62601 + again:
62602 +       ret = xenbus_transaction_start(&xbt);
62603 +       if (ret) {
62604 +               xenbus_dev_fatal(dev, ret, "starting transaction");
62605 +               return ret;
62606 +       }
62607 +       ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
62608 +                           virt_to_mfn(info->page));
62609 +       if (ret)
62610 +               goto error_xenbus;
62611 +       ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
62612 +                           info->evtchn);
62613 +       if (ret)
62614 +               goto error_xenbus;
62615 +       ret = xenbus_transaction_end(xbt, 0);
62616 +       if (ret) {
62617 +               if (ret == -EAGAIN)
62618 +                       goto again;
62619 +               xenbus_dev_fatal(dev, ret, "completing transaction");
62620 +               return ret;
62621 +       }
62622 +
62623 +       xenbus_switch_state(dev, XenbusStateInitialised);
62624 +       return 0;
62625 +
62626 + error_xenbus:
62627 +       xenbus_transaction_end(xbt, 1);
62628 +       xenbus_dev_fatal(dev, ret, "writing xenstore");
62629 +       return ret;
62630 +}
62631 +
62632 +static void xenkbd_disconnect_backend(struct xenkbd_info *info)
62633 +{
62634 +       if (info->irq >= 0)
62635 +               unbind_from_irqhandler(info->irq, info);
62636 +       info->irq = -1;
62637 +}
62638 +
62639 +static void xenkbd_backend_changed(struct xenbus_device *dev,
62640 +                                  enum xenbus_state backend_state)
62641 +{
62642 +       struct xenkbd_info *info = dev->dev.driver_data;
62643 +       int ret, val;
62644 +
62645 +       switch (backend_state) {
62646 +       case XenbusStateInitialising:
62647 +       case XenbusStateInitialised:
62648 +       case XenbusStateUnknown:
62649 +       case XenbusStateClosed:
62650 +               break;
62651 +
62652 +       case XenbusStateInitWait:
62653 +       InitWait:
62654 +               ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
62655 +                                  "feature-abs-pointer", "%d", &val);
62656 +               if (ret < 0)
62657 +                       val = 0;
62658 +               if (val) {
62659 +                       ret = xenbus_printf(XBT_NIL, info->xbdev->nodename,
62660 +                                           "request-abs-pointer", "1");
62661 +                       if (ret)
62662 +                               ; /* FIXME */
62663 +               }
62664 +               xenbus_switch_state(dev, XenbusStateConnected);
62665 +               break;
62666 +
62667 +       case XenbusStateConnected:
62668 +               /*
62669 +                * Work around xenbus race condition: If backend goes
62670 +                * through InitWait to Connected fast enough, we can
62671 +                * get Connected twice here.
62672 +                */
62673 +               if (dev->state != XenbusStateConnected)
62674 +                       goto InitWait; /* no InitWait seen yet, fudge it */
62675 +               break;
62676 +
62677 +       case XenbusStateClosing:
62678 +               xenbus_frontend_closed(dev);
62679 +               break;
62680 +       }
62681 +}
62682 +
62683 +static struct xenbus_device_id xenkbd_ids[] = {
62684 +       { "vkbd" },
62685 +       { "" }
62686 +};
62687 +
62688 +static struct xenbus_driver xenkbd = {
62689 +       .name = "vkbd",
62690 +       .owner = THIS_MODULE,
62691 +       .ids = xenkbd_ids,
62692 +       .probe = xenkbd_probe,
62693 +       .remove = xenkbd_remove,
62694 +       .resume = xenkbd_resume,
62695 +       .otherend_changed = xenkbd_backend_changed,
62696 +};
62697 +
62698 +static int __init xenkbd_init(void)
62699 +{
62700 +       if (!is_running_on_xen())
62701 +               return -ENODEV;
62702 +
62703 +       /* Nothing to do if running in dom0. */
62704 +       if (is_initial_xendomain())
62705 +               return -ENODEV;
62706 +
62707 +       return xenbus_register_frontend(&xenkbd);
62708 +}
62709 +
62710 +static void __exit xenkbd_cleanup(void)
62711 +{
62712 +       return xenbus_unregister_driver(&xenkbd);
62713 +}
62714 +
62715 +module_init(xenkbd_init);
62716 +module_exit(xenkbd_cleanup);
62717 +
62718 +MODULE_LICENSE("GPL");
62719 diff -ruNp linux-2.6.19/drivers/xen/netback/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/netback/Makefile
62720 --- linux-2.6.19/drivers/xen/netback/Makefile   1970-01-01 00:00:00.000000000 +0000
62721 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netback/Makefile 2007-02-02 19:10:45.000000000 +0000
62722 @@ -0,0 +1,5 @@
62723 +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
62724 +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
62725 +
62726 +netbk-y   := netback.o xenbus.o interface.o
62727 +netloop-y := loopback.o
62728 diff -ruNp linux-2.6.19/drivers/xen/netback/common.h linux-2.6.19-xen-3.0.4/drivers/xen/netback/common.h
62729 --- linux-2.6.19/drivers/xen/netback/common.h   1970-01-01 00:00:00.000000000 +0000
62730 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netback/common.h 2007-02-02 19:10:45.000000000 +0000
62731 @@ -0,0 +1,146 @@
62732 +/******************************************************************************
62733 + * arch/xen/drivers/netif/backend/common.h
62734 + * 
62735 + * This program is free software; you can redistribute it and/or
62736 + * modify it under the terms of the GNU General Public License version 2
62737 + * as published by the Free Software Foundation; or, when distributed
62738 + * separately from the Linux kernel or incorporated into other
62739 + * software packages, subject to the following license:
62740 + * 
62741 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62742 + * of this source file (the "Software"), to deal in the Software without
62743 + * restriction, including without limitation the rights to use, copy, modify,
62744 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62745 + * and to permit persons to whom the Software is furnished to do so, subject to
62746 + * the following conditions:
62747 + * 
62748 + * The above copyright notice and this permission notice shall be included in
62749 + * all copies or substantial portions of the Software.
62750 + * 
62751 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62752 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62753 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62754 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62755 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62756 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62757 + * IN THE SOFTWARE.
62758 + */
62759 +
62760 +#ifndef __NETIF__BACKEND__COMMON_H__
62761 +#define __NETIF__BACKEND__COMMON_H__
62762 +
62763 +#include <linux/version.h>
62764 +#include <linux/module.h>
62765 +#include <linux/interrupt.h>
62766 +#include <linux/slab.h>
62767 +#include <linux/ip.h>
62768 +#include <linux/in.h>
62769 +#include <linux/netdevice.h>
62770 +#include <linux/etherdevice.h>
62771 +#include <linux/wait.h>
62772 +#include <xen/evtchn.h>
62773 +#include <xen/interface/io/netif.h>
62774 +#include <asm/io.h>
62775 +#include <asm/pgalloc.h>
62776 +#include <xen/interface/grant_table.h>
62777 +#include <xen/gnttab.h>
62778 +#include <xen/driver_util.h>
62779 +#include <asm/hypercall.h>
62780 +
62781 +#define DPRINTK(_f, _a...)                     \
62782 +       pr_debug("(file=%s, line=%d) " _f,      \
62783 +                __FILE__ , __LINE__ , ## _a )
62784 +#define IPRINTK(fmt, args...)                          \
62785 +       printk(KERN_INFO "xen_net: " fmt, ##args)
62786 +#define WPRINTK(fmt, args...)                          \
62787 +       printk(KERN_WARNING "xen_net: " fmt, ##args)
62788 +
62789 +typedef struct netif_st {
62790 +       /* Unique identifier for this interface. */
62791 +       domid_t          domid;
62792 +       unsigned int     handle;
62793 +
62794 +       u8               fe_dev_addr[6];
62795 +
62796 +       /* Physical parameters of the comms window. */
62797 +       grant_handle_t   tx_shmem_handle;
62798 +       grant_ref_t      tx_shmem_ref;
62799 +       grant_handle_t   rx_shmem_handle;
62800 +       grant_ref_t      rx_shmem_ref;
62801 +       unsigned int     evtchn;
62802 +       unsigned int     irq;
62803 +
62804 +       /* The shared rings and indexes. */
62805 +       netif_tx_back_ring_t tx;
62806 +       netif_rx_back_ring_t rx;
62807 +       struct vm_struct *tx_comms_area;
62808 +       struct vm_struct *rx_comms_area;
62809 +
62810 +       /* Set of features that can be turned on in dev->features. */
62811 +       int features;
62812 +
62813 +       /* Internal feature information. */
62814 +       int can_queue:1;        /* can queue packets for receiver? */
62815 +       int copying_receiver:1; /* copy packets to receiver?       */
62816 +
62817 +       /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
62818 +       RING_IDX rx_req_cons_peek;
62819 +
62820 +       /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
62821 +       unsigned long   credit_bytes;
62822 +       unsigned long   credit_usec;
62823 +       unsigned long   remaining_credit;
62824 +       struct timer_list credit_timeout;
62825 +
62826 +       /* Enforce draining of the transmit queue. */
62827 +       struct timer_list tx_queue_timeout;
62828 +
62829 +       /* Miscellaneous private stuff. */
62830 +       struct list_head list;  /* scheduling list */
62831 +       atomic_t         refcnt;
62832 +       struct net_device *dev;
62833 +       struct net_device_stats stats;
62834 +
62835 +       wait_queue_head_t waiting_to_free;
62836 +} netif_t;
62837 +
62838 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
62839 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
62840 +
62841 +void netif_disconnect(netif_t *netif);
62842 +
62843 +netif_t *netif_alloc(domid_t domid, unsigned int handle);
62844 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
62845 +             unsigned long rx_ring_ref, unsigned int evtchn);
62846 +
62847 +#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
62848 +#define netif_put(_b)                                          \
62849 +       do {                                                    \
62850 +               if ( atomic_dec_and_test(&(_b)->refcnt) )       \
62851 +                       wake_up(&(_b)->waiting_to_free);        \
62852 +       } while (0)
62853 +
62854 +void netif_xenbus_init(void);
62855 +
62856 +#define netif_schedulable(dev) (netif_running(dev) && netif_carrier_ok(dev))
62857 +
62858 +void netif_schedule_work(netif_t *netif);
62859 +void netif_deschedule_work(netif_t *netif);
62860 +
62861 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
62862 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
62863 +irqreturn_t netif_be_int(int irq, void *dev_id);
62864 +
62865 +static inline int netbk_can_queue(struct net_device *dev)
62866 +{
62867 +       netif_t *netif = netdev_priv(dev);
62868 +       return netif->can_queue;
62869 +}
62870 +
62871 +static inline int netbk_can_sg(struct net_device *dev)
62872 +{
62873 +       netif_t *netif = netdev_priv(dev);
62874 +       return netif->features & NETIF_F_SG;
62875 +}
62876 +
62877 +#endif /* __NETIF__BACKEND__COMMON_H__ */
62878 diff -ruNp linux-2.6.19/drivers/xen/netback/interface.c linux-2.6.19-xen-3.0.4/drivers/xen/netback/interface.c
62879 --- linux-2.6.19/drivers/xen/netback/interface.c        1970-01-01 00:00:00.000000000 +0000
62880 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netback/interface.c      2007-02-02 19:10:45.000000000 +0000
62881 @@ -0,0 +1,349 @@
62882 +/******************************************************************************
62883 + * arch/xen/drivers/netif/backend/interface.c
62884 + * 
62885 + * Network-device interface management.
62886 + * 
62887 + * Copyright (c) 2004-2005, Keir Fraser
62888 + * 
62889 + * This program is free software; you can redistribute it and/or
62890 + * modify it under the terms of the GNU General Public License version 2
62891 + * as published by the Free Software Foundation; or, when distributed
62892 + * separately from the Linux kernel or incorporated into other
62893 + * software packages, subject to the following license:
62894 + * 
62895 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62896 + * of this source file (the "Software"), to deal in the Software without
62897 + * restriction, including without limitation the rights to use, copy, modify,
62898 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62899 + * and to permit persons to whom the Software is furnished to do so, subject to
62900 + * the following conditions:
62901 + * 
62902 + * The above copyright notice and this permission notice shall be included in
62903 + * all copies or substantial portions of the Software.
62904 + * 
62905 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62906 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62907 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62908 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62909 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62910 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62911 + * IN THE SOFTWARE.
62912 + */
62913 +
62914 +#include "common.h"
62915 +#include <linux/ethtool.h>
62916 +#include <linux/rtnetlink.h>
62917 +
62918 +/*
62919 + * Module parameter 'queue_length':
62920 + * 
62921 + * Enables queuing in the network stack when a client has run out of receive
62922 + * descriptors. Although this feature can improve receive bandwidth by avoiding
62923 + * packet loss, it can also result in packets sitting in the 'tx_queue' for
62924 + * unbounded time. This is bad if those packets hold onto foreign resources.
62925 + * For example, consider a packet that holds onto resources belonging to the
62926 + * guest for which it is queued (e.g., packet received on vif1.0, destined for
62927 + * vif1.1 which is not activated in the guest): in this situation the guest
62928 + * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
62929 + * run a timer (tx_queue_timeout) to drain the queue when the interface is
62930 + * blocked.
62931 + */
62932 +static unsigned long netbk_queue_length = 32;
62933 +module_param_named(queue_length, netbk_queue_length, ulong, 0);
62934 +
62935 +static void __netif_up(netif_t *netif)
62936 +{
62937 +       enable_irq(netif->irq);
62938 +       netif_schedule_work(netif);
62939 +}
62940 +
62941 +static void __netif_down(netif_t *netif)
62942 +{
62943 +       disable_irq(netif->irq);
62944 +       netif_deschedule_work(netif);
62945 +}
62946 +
62947 +static int net_open(struct net_device *dev)
62948 +{
62949 +       netif_t *netif = netdev_priv(dev);
62950 +       if (netif_carrier_ok(dev))
62951 +               __netif_up(netif);
62952 +       return 0;
62953 +}
62954 +
62955 +static int net_close(struct net_device *dev)
62956 +{
62957 +       netif_t *netif = netdev_priv(dev);
62958 +       if (netif_carrier_ok(dev))
62959 +               __netif_down(netif);
62960 +       return 0;
62961 +}
62962 +
62963 +static int netbk_change_mtu(struct net_device *dev, int mtu)
62964 +{
62965 +       int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
62966 +
62967 +       if (mtu > max)
62968 +               return -EINVAL;
62969 +       dev->mtu = mtu;
62970 +       return 0;
62971 +}
62972 +
62973 +static int netbk_set_sg(struct net_device *dev, u32 data)
62974 +{
62975 +       if (data) {
62976 +               netif_t *netif = netdev_priv(dev);
62977 +
62978 +               if (!(netif->features & NETIF_F_SG))
62979 +                       return -ENOSYS;
62980 +       }
62981 +
62982 +       return ethtool_op_set_sg(dev, data);
62983 +}
62984 +
62985 +static int netbk_set_tso(struct net_device *dev, u32 data)
62986 +{
62987 +       if (data) {
62988 +               netif_t *netif = netdev_priv(dev);
62989 +
62990 +               if (!(netif->features & NETIF_F_TSO))
62991 +                       return -ENOSYS;
62992 +       }
62993 +
62994 +       return ethtool_op_set_tso(dev, data);
62995 +}
62996 +
62997 +static struct ethtool_ops network_ethtool_ops =
62998 +{
62999 +       .get_tx_csum = ethtool_op_get_tx_csum,
63000 +       .set_tx_csum = ethtool_op_set_tx_csum,
63001 +       .get_sg = ethtool_op_get_sg,
63002 +       .set_sg = netbk_set_sg,
63003 +       .get_tso = ethtool_op_get_tso,
63004 +       .set_tso = netbk_set_tso,
63005 +       .get_link = ethtool_op_get_link,
63006 +};
63007 +
63008 +netif_t *netif_alloc(domid_t domid, unsigned int handle)
63009 +{
63010 +       int err = 0;
63011 +       struct net_device *dev;
63012 +       netif_t *netif;
63013 +       char name[IFNAMSIZ] = {};
63014 +
63015 +       snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
63016 +       dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
63017 +       if (dev == NULL) {
63018 +               DPRINTK("Could not create netif: out of memory\n");
63019 +               return ERR_PTR(-ENOMEM);
63020 +       }
63021 +
63022 +       netif_carrier_off(dev);
63023 +
63024 +       netif = netdev_priv(dev);
63025 +       memset(netif, 0, sizeof(*netif));
63026 +       netif->domid  = domid;
63027 +       netif->handle = handle;
63028 +       atomic_set(&netif->refcnt, 1);
63029 +       init_waitqueue_head(&netif->waiting_to_free);
63030 +       netif->dev = dev;
63031 +
63032 +       netif->credit_bytes = netif->remaining_credit = ~0UL;
63033 +       netif->credit_usec  = 0UL;
63034 +       init_timer(&netif->credit_timeout);
63035 +       /* Initialize 'expires' now: it's used to track the credit window. */
63036 +       netif->credit_timeout.expires = jiffies;
63037 +
63038 +       init_timer(&netif->tx_queue_timeout);
63039 +
63040 +       dev->hard_start_xmit = netif_be_start_xmit;
63041 +       dev->get_stats       = netif_be_get_stats;
63042 +       dev->open            = net_open;
63043 +       dev->stop            = net_close;
63044 +       dev->change_mtu      = netbk_change_mtu;
63045 +       dev->features        = NETIF_F_IP_CSUM;
63046 +
63047 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
63048 +
63049 +       dev->tx_queue_len = netbk_queue_length;
63050 +
63051 +       /*
63052 +        * Initialise a dummy MAC address. We choose the numerically
63053 +        * largest non-broadcast address to prevent the address getting
63054 +        * stolen by an Ethernet bridge for STP purposes.
63055 +        * (FE:FF:FF:FF:FF:FF)
63056 +        */ 
63057 +       memset(dev->dev_addr, 0xFF, ETH_ALEN);
63058 +       dev->dev_addr[0] &= ~0x01;
63059 +
63060 +       rtnl_lock();
63061 +       err = register_netdevice(dev);
63062 +       rtnl_unlock();
63063 +       if (err) {
63064 +               DPRINTK("Could not register new net device %s: err=%d\n",
63065 +                       dev->name, err);
63066 +               free_netdev(dev);
63067 +               return ERR_PTR(err);
63068 +       }
63069 +
63070 +       DPRINTK("Successfully created netif\n");
63071 +       return netif;
63072 +}
63073 +
63074 +static int map_frontend_pages(
63075 +       netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
63076 +{
63077 +       struct gnttab_map_grant_ref op;
63078 +       int ret;
63079 +
63080 +       gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
63081 +                         GNTMAP_host_map, tx_ring_ref, netif->domid);
63082 +    
63083 +       lock_vm_area(netif->tx_comms_area);
63084 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
63085 +       unlock_vm_area(netif->tx_comms_area);
63086 +       BUG_ON(ret);
63087 +
63088 +       if (op.status) { 
63089 +               DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
63090 +               return op.status;
63091 +       }
63092 +
63093 +       netif->tx_shmem_ref    = tx_ring_ref;
63094 +       netif->tx_shmem_handle = op.handle;
63095 +
63096 +       gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
63097 +                         GNTMAP_host_map, rx_ring_ref, netif->domid);
63098 +
63099 +       lock_vm_area(netif->rx_comms_area);
63100 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
63101 +       unlock_vm_area(netif->rx_comms_area);
63102 +       BUG_ON(ret);
63103 +
63104 +       if (op.status) {
63105 +               DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
63106 +               return op.status;
63107 +       }
63108 +
63109 +       netif->rx_shmem_ref    = rx_ring_ref;
63110 +       netif->rx_shmem_handle = op.handle;
63111 +
63112 +       return 0;
63113 +}
63114 +
63115 +static void unmap_frontend_pages(netif_t *netif)
63116 +{
63117 +       struct gnttab_unmap_grant_ref op;
63118 +       int ret;
63119 +
63120 +       gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
63121 +                           GNTMAP_host_map, netif->tx_shmem_handle);
63122 +
63123 +       lock_vm_area(netif->tx_comms_area);
63124 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
63125 +       unlock_vm_area(netif->tx_comms_area);
63126 +       BUG_ON(ret);
63127 +
63128 +       gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
63129 +                           GNTMAP_host_map, netif->rx_shmem_handle);
63130 +
63131 +       lock_vm_area(netif->rx_comms_area);
63132 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
63133 +       unlock_vm_area(netif->rx_comms_area);
63134 +       BUG_ON(ret);
63135 +}
63136 +
63137 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
63138 +             unsigned long rx_ring_ref, unsigned int evtchn)
63139 +{
63140 +       int err = -ENOMEM;
63141 +       netif_tx_sring_t *txs;
63142 +       netif_rx_sring_t *rxs;
63143 +       struct evtchn_bind_interdomain bind_interdomain;
63144 +
63145 +       /* Already connected through? */
63146 +       if (netif->irq)
63147 +               return 0;
63148 +
63149 +       netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
63150 +       if (netif->tx_comms_area == NULL)
63151 +               return -ENOMEM;
63152 +       netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
63153 +       if (netif->rx_comms_area == NULL)
63154 +               goto err_rx;
63155 +
63156 +       err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
63157 +       if (err)
63158 +               goto err_map;
63159 +
63160 +       bind_interdomain.remote_dom = netif->domid;
63161 +       bind_interdomain.remote_port = evtchn;
63162 +
63163 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
63164 +                                         &bind_interdomain);
63165 +       if (err)
63166 +               goto err_hypervisor;
63167 +
63168 +       netif->evtchn = bind_interdomain.local_port;
63169 +
63170 +       netif->irq = bind_evtchn_to_irqhandler(
63171 +               netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
63172 +       disable_irq(netif->irq);
63173 +
63174 +       txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
63175 +       BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
63176 +
63177 +       rxs = (netif_rx_sring_t *)
63178 +               ((char *)netif->rx_comms_area->addr);
63179 +       BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
63180 +
63181 +       netif->rx_req_cons_peek = 0;
63182 +
63183 +       netif_get(netif);
63184 +
63185 +       rtnl_lock();
63186 +       netif_carrier_on(netif->dev);
63187 +       if (netif_running(netif->dev))
63188 +               __netif_up(netif);
63189 +       rtnl_unlock();
63190 +
63191 +       return 0;
63192 +err_hypervisor:
63193 +       unmap_frontend_pages(netif);
63194 +err_map:
63195 +       free_vm_area(netif->rx_comms_area);
63196 +err_rx:
63197 +       free_vm_area(netif->tx_comms_area);
63198 +       return err;
63199 +}
63200 +
63201 +void netif_disconnect(netif_t *netif)
63202 +{
63203 +       if (netif_carrier_ok(netif->dev)) {
63204 +               rtnl_lock();
63205 +               netif_carrier_off(netif->dev);
63206 +               if (netif_running(netif->dev))
63207 +                       __netif_down(netif);
63208 +               rtnl_unlock();
63209 +               netif_put(netif);
63210 +       }
63211 +
63212 +       atomic_dec(&netif->refcnt);
63213 +       wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
63214 +
63215 +       del_timer_sync(&netif->credit_timeout);
63216 +       del_timer_sync(&netif->tx_queue_timeout);
63217 +
63218 +       if (netif->irq)
63219 +               unbind_from_irqhandler(netif->irq, netif);
63220 +       
63221 +       unregister_netdev(netif->dev);
63222 +
63223 +       if (netif->tx.sring) {
63224 +               unmap_frontend_pages(netif);
63225 +               free_vm_area(netif->tx_comms_area);
63226 +               free_vm_area(netif->rx_comms_area);
63227 +       }
63228 +
63229 +       free_netdev(netif->dev);
63230 +}
63231 diff -ruNp linux-2.6.19/drivers/xen/netback/loopback.c linux-2.6.19-xen-3.0.4/drivers/xen/netback/loopback.c
63232 --- linux-2.6.19/drivers/xen/netback/loopback.c 1970-01-01 00:00:00.000000000 +0000
63233 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netback/loopback.c       2007-02-02 19:10:45.000000000 +0000
63234 @@ -0,0 +1,320 @@
63235 +/******************************************************************************
63236 + * netback/loopback.c
63237 + * 
63238 + * A two-interface loopback device to emulate a local netfront-netback
63239 + * connection. This ensures that local packet delivery looks identical
63240 + * to inter-domain delivery. Most importantly, packets delivered locally
63241 + * originating from other domains will get *copied* when they traverse this
63242 + * driver. This prevents unbounded delays in socket-buffer queues from
63243 + * causing the netback driver to "seize up".
63244 + * 
63245 + * This driver creates a symmetric pair of loopback interfaces with names
63246 + * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
63247 + * bridge, just like a proper netback interface, while a local IP interface
63248 + * is configured on 'veth0'.
63249 + * 
63250 + * As with a real netback interface, vif0.0 is configured with a suitable
63251 + * dummy MAC address. No default is provided for veth0: a reasonable strategy
63252 + * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
63253 + * (to avoid confusing the Etherbridge).
63254 + * 
63255 + * Copyright (c) 2005 K A Fraser
63256 + * 
63257 + * This program is free software; you can redistribute it and/or
63258 + * modify it under the terms of the GNU General Public License version 2
63259 + * as published by the Free Software Foundation; or, when distributed
63260 + * separately from the Linux kernel or incorporated into other
63261 + * software packages, subject to the following license:
63262 + * 
63263 + * Permission is hereby granted, free of charge, to any person obtaining a copy
63264 + * of this source file (the "Software"), to deal in the Software without
63265 + * restriction, including without limitation the rights to use, copy, modify,
63266 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
63267 + * and to permit persons to whom the Software is furnished to do so, subject to
63268 + * the following conditions:
63269 + * 
63270 + * The above copyright notice and this permission notice shall be included in
63271 + * all copies or substantial portions of the Software.
63272 + * 
63273 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
63274 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63275 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
63276 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63277 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
63278 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
63279 + * IN THE SOFTWARE.
63280 + */
63281 +
63282 +#include <linux/module.h>
63283 +#include <linux/netdevice.h>
63284 +#include <linux/inetdevice.h>
63285 +#include <linux/etherdevice.h>
63286 +#include <linux/skbuff.h>
63287 +#include <linux/ethtool.h>
63288 +#include <net/dst.h>
63289 +#include <net/xfrm.h>          /* secpath_reset() */
63290 +#include <asm/hypervisor.h>    /* is_initial_xendomain() */
63291 +
63292 +static int nloopbacks = -1;
63293 +module_param(nloopbacks, int, 0);
63294 +MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
63295 +
63296 +struct net_private {
63297 +       struct net_device *loopback_dev;
63298 +       struct net_device_stats stats;
63299 +};
63300 +
63301 +static int loopback_open(struct net_device *dev)
63302 +{
63303 +       struct net_private *np = netdev_priv(dev);
63304 +       memset(&np->stats, 0, sizeof(np->stats));
63305 +       netif_start_queue(dev);
63306 +       return 0;
63307 +}
63308 +
63309 +static int loopback_close(struct net_device *dev)
63310 +{
63311 +       netif_stop_queue(dev);
63312 +       return 0;
63313 +}
63314 +
63315 +#ifdef CONFIG_X86
63316 +static int is_foreign(unsigned long pfn)
63317 +{
63318 +       /* NB. Play it safe for auto-translation mode. */
63319 +       return (xen_feature(XENFEAT_auto_translated_physmap) ||
63320 +               (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT));
63321 +}
63322 +#else
63323 +/* How to detect a foreign mapping? Play it safe. */
63324 +#define is_foreign(pfn)        (1)
63325 +#endif
63326 +
63327 +static int skb_remove_foreign_references(struct sk_buff *skb)
63328 +{
63329 +       struct page *page;
63330 +       unsigned long pfn;
63331 +       int i, off;
63332 +       char *vaddr;
63333 +
63334 +       BUG_ON(skb_shinfo(skb)->frag_list);
63335 +
63336 +       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
63337 +               pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page);
63338 +               if (!is_foreign(pfn))
63339 +                       continue;
63340 +               
63341 +               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
63342 +               if (unlikely(!page))
63343 +                       return 0;
63344 +
63345 +               vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
63346 +               off = skb_shinfo(skb)->frags[i].page_offset;
63347 +               memcpy(page_address(page) + off,
63348 +                      vaddr + off,
63349 +                      skb_shinfo(skb)->frags[i].size);
63350 +               kunmap_skb_frag(vaddr);
63351 +
63352 +               put_page(skb_shinfo(skb)->frags[i].page);
63353 +               skb_shinfo(skb)->frags[i].page = page;
63354 +       }
63355 +
63356 +       return 1;
63357 +}
63358 +
63359 +static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
63360 +{
63361 +       struct net_private *np = netdev_priv(dev);
63362 +
63363 +       if (!skb_remove_foreign_references(skb)) {
63364 +               np->stats.tx_dropped++;
63365 +               dev_kfree_skb(skb);
63366 +               return 0;
63367 +       }
63368 +
63369 +       dst_release(skb->dst);
63370 +       skb->dst = NULL;
63371 +
63372 +       skb_orphan(skb);
63373 +
63374 +       np->stats.tx_bytes += skb->len;
63375 +       np->stats.tx_packets++;
63376 +
63377 +       /* Switch to loopback context. */
63378 +       dev = np->loopback_dev;
63379 +       np  = netdev_priv(dev);
63380 +
63381 +       np->stats.rx_bytes += skb->len;
63382 +       np->stats.rx_packets++;
63383 +
63384 +       if (skb->ip_summed == CHECKSUM_PARTIAL) {
63385 +               /* Defer checksum calculation. */
63386 +               skb->proto_csum_blank = 1;
63387 +               /* Must be a local packet: assert its integrity. */
63388 +               skb->proto_data_valid = 1;
63389 +       }
63390 +
63391 +       skb->ip_summed = skb->proto_data_valid ?
63392 +               CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
63393 +
63394 +       skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
63395 +       skb->protocol = eth_type_trans(skb, dev);
63396 +       skb->dev      = dev;
63397 +       dev->last_rx  = jiffies;
63398 +
63399 +       /* Flush netfilter context: rx'ed skbuffs not expected to have any. */
63400 +       nf_reset(skb);
63401 +       secpath_reset(skb);
63402 +
63403 +       netif_rx(skb);
63404 +
63405 +       return 0;
63406 +}
63407 +
63408 +static struct net_device_stats *loopback_get_stats(struct net_device *dev)
63409 +{
63410 +       struct net_private *np = netdev_priv(dev);
63411 +       return &np->stats;
63412 +}
63413 +
63414 +static struct ethtool_ops network_ethtool_ops =
63415 +{
63416 +       .get_tx_csum = ethtool_op_get_tx_csum,
63417 +       .set_tx_csum = ethtool_op_set_tx_csum,
63418 +       .get_sg = ethtool_op_get_sg,
63419 +       .set_sg = ethtool_op_set_sg,
63420 +       .get_tso = ethtool_op_get_tso,
63421 +       .set_tso = ethtool_op_set_tso,
63422 +       .get_link = ethtool_op_get_link,
63423 +};
63424 +
63425 +/*
63426 + * Nothing to do here. Virtual interface is point-to-point and the
63427 + * physical interface is probably promiscuous anyway.
63428 + */
63429 +static void loopback_set_multicast_list(struct net_device *dev)
63430 +{
63431 +}
63432 +
63433 +static void loopback_construct(struct net_device *dev, struct net_device *lo)
63434 +{
63435 +       struct net_private *np = netdev_priv(dev);
63436 +
63437 +       np->loopback_dev     = lo;
63438 +
63439 +       dev->open            = loopback_open;
63440 +       dev->stop            = loopback_close;
63441 +       dev->hard_start_xmit = loopback_start_xmit;
63442 +       dev->get_stats       = loopback_get_stats;
63443 +       dev->set_multicast_list = loopback_set_multicast_list;
63444 +       dev->change_mtu      = NULL; /* allow arbitrary mtu */
63445 +
63446 +       dev->tx_queue_len    = 0;
63447 +
63448 +       dev->features        = (NETIF_F_HIGHDMA |
63449 +                               NETIF_F_LLTX |
63450 +                               NETIF_F_TSO |
63451 +                               NETIF_F_SG |
63452 +                               NETIF_F_IP_CSUM);
63453 +
63454 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
63455 +
63456 +       /*
63457 +        * We do not set a jumbo MTU on the interface. Otherwise the network
63458 +        * stack will try to send large packets that will get dropped by the
63459 +        * Ethernet bridge (unless the physical Ethernet interface is
63460 +        * configured to transfer jumbo packets). If a larger MTU is desired
63461 +        * then the system administrator can specify it using the 'ifconfig'
63462 +        * command.
63463 +        */
63464 +       /*dev->mtu             = 16*1024;*/
63465 +}
63466 +
63467 +static int __init make_loopback(int i)
63468 +{
63469 +       struct net_device *dev1, *dev2;
63470 +       char dev_name[IFNAMSIZ];
63471 +       int err = -ENOMEM;
63472 +
63473 +       sprintf(dev_name, "vif0.%d", i);
63474 +       dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
63475 +       if (!dev1)
63476 +               return err;
63477 +
63478 +       sprintf(dev_name, "veth%d", i);
63479 +       dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
63480 +       if (!dev2)
63481 +               goto fail_netdev2;
63482 +
63483 +       loopback_construct(dev1, dev2);
63484 +       loopback_construct(dev2, dev1);
63485 +
63486 +       /*
63487 +        * Initialise a dummy MAC address for the 'dummy backend' interface. We
63488 +        * choose the numerically largest non-broadcast address to prevent the
63489 +        * address getting stolen by an Ethernet bridge for STP purposes.
63490 +        */
63491 +       memset(dev1->dev_addr, 0xFF, ETH_ALEN);
63492 +       dev1->dev_addr[0] &= ~0x01;
63493 +
63494 +       if ((err = register_netdev(dev1)) != 0)
63495 +               goto fail;
63496 +
63497 +       if ((err = register_netdev(dev2)) != 0) {
63498 +               unregister_netdev(dev1);
63499 +               goto fail;
63500 +       }
63501 +
63502 +       return 0;
63503 +
63504 + fail:
63505 +       free_netdev(dev2);
63506 + fail_netdev2:
63507 +       free_netdev(dev1);
63508 +       return err;
63509 +}
63510 +
63511 +static void __exit clean_loopback(int i)
63512 +{
63513 +       struct net_device *dev1, *dev2;
63514 +       char dev_name[IFNAMSIZ];
63515 +
63516 +       sprintf(dev_name, "vif0.%d", i);
63517 +       dev1 = dev_get_by_name(dev_name);
63518 +       sprintf(dev_name, "veth%d", i);
63519 +       dev2 = dev_get_by_name(dev_name);
63520 +       if (dev1 && dev2) {
63521 +               unregister_netdev(dev2);
63522 +               unregister_netdev(dev1);
63523 +               free_netdev(dev2);
63524 +               free_netdev(dev1);
63525 +       }
63526 +}
63527 +
63528 +static int __init loopback_init(void)
63529 +{
63530 +       int i, err = 0;
63531 +
63532 +       if (nloopbacks == -1)
63533 +               nloopbacks = is_initial_xendomain() ? 4 : 0;
63534 +
63535 +       for (i = 0; i < nloopbacks; i++)
63536 +               if ((err = make_loopback(i)) != 0)
63537 +                       break;
63538 +
63539 +       return err;
63540 +}
63541 +
63542 +module_init(loopback_init);
63543 +
63544 +static void __exit loopback_exit(void)
63545 +{
63546 +       int i;
63547 +
63548 +       for (i = nloopbacks; i-- > 0; )
63549 +               clean_loopback(i);
63550 +}
63551 +
63552 +module_exit(loopback_exit);
63553 +
63554 +MODULE_LICENSE("Dual BSD/GPL");
63555 diff -ruNp linux-2.6.19/drivers/xen/netback/netback.c linux-2.6.19-xen-3.0.4/drivers/xen/netback/netback.c
63556 --- linux-2.6.19/drivers/xen/netback/netback.c  1970-01-01 00:00:00.000000000 +0000
63557 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netback/netback.c        2007-02-02 19:10:45.000000000 +0000
63558 @@ -0,0 +1,1524 @@
63559 +/******************************************************************************
63560 + * drivers/xen/netback/netback.c
63561 + * 
63562 + * Back-end of the driver for virtual network devices. This portion of the
63563 + * driver exports a 'unified' network-device interface that can be accessed
63564 + * by any operating system that implements a compatible front end. A 
63565 + * reference front-end implementation can be found in:
63566 + *  drivers/xen/netfront/netfront.c
63567 + * 
63568 + * Copyright (c) 2002-2005, K A Fraser
63569 + * 
63570 + * This program is free software; you can redistribute it and/or
63571 + * modify it under the terms of the GNU General Public License version 2
63572 + * as published by the Free Software Foundation; or, when distributed
63573 + * separately from the Linux kernel or incorporated into other
63574 + * software packages, subject to the following license:
63575 + * 
63576 + * Permission is hereby granted, free of charge, to any person obtaining a copy
63577 + * of this source file (the "Software"), to deal in the Software without
63578 + * restriction, including without limitation the rights to use, copy, modify,
63579 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
63580 + * and to permit persons to whom the Software is furnished to do so, subject to
63581 + * the following conditions:
63582 + * 
63583 + * The above copyright notice and this permission notice shall be included in
63584 + * all copies or substantial portions of the Software.
63585 + * 
63586 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
63587 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63588 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
63589 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63590 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
63591 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
63592 + * IN THE SOFTWARE.
63593 + */
63594 +
63595 +#include "common.h"
63596 +#include <xen/balloon.h>
63597 +#include <xen/interface/memory.h>
63598 +#include <asm/page.h>
63599 +
63600 +/*#define NETBE_DEBUG_INTERRUPT*/
63601 +
63602 +struct netbk_rx_meta {
63603 +       skb_frag_t frag;
63604 +       int id;
63605 +       int copy:1;
63606 +};
63607 +
63608 +static void netif_idx_release(u16 pending_idx);
63609 +static void netif_page_release(struct page *page);
63610 +static void make_tx_response(netif_t *netif, 
63611 +                            netif_tx_request_t *txp,
63612 +                            s8       st);
63613 +static netif_rx_response_t *make_rx_response(netif_t *netif, 
63614 +                                            u16      id, 
63615 +                                            s8       st,
63616 +                                            u16      offset,
63617 +                                            u16      size,
63618 +                                            u16      flags);
63619 +
63620 +static void net_tx_action(unsigned long unused);
63621 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
63622 +
63623 +static void net_rx_action(unsigned long unused);
63624 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
63625 +
63626 +static struct timer_list net_timer;
63627 +
63628 +#define MAX_PENDING_REQS 256
63629 +
63630 +static struct sk_buff_head rx_queue;
63631 +
63632 +static struct page **mmap_pages;
63633 +static inline unsigned long idx_to_kaddr(unsigned int idx)
63634 +{
63635 +       return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx]));
63636 +}
63637 +
63638 +#define PKT_PROT_LEN 64
63639 +
63640 +static struct pending_tx_info {
63641 +       netif_tx_request_t req;
63642 +       netif_t *netif;
63643 +} pending_tx_info[MAX_PENDING_REQS];
63644 +static u16 pending_ring[MAX_PENDING_REQS];
63645 +typedef unsigned int PEND_RING_IDX;
63646 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
63647 +static PEND_RING_IDX pending_prod, pending_cons;
63648 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
63649 +
63650 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
63651 +static u16 dealloc_ring[MAX_PENDING_REQS];
63652 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
63653 +
63654 +static struct sk_buff_head tx_queue;
63655 +
63656 +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
63657 +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
63658 +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
63659 +
63660 +static struct list_head net_schedule_list;
63661 +static spinlock_t net_schedule_list_lock;
63662 +
63663 +#define MAX_MFN_ALLOC 64
63664 +static unsigned long mfn_list[MAX_MFN_ALLOC];
63665 +static unsigned int alloc_index = 0;
63666 +
63667 +static inline unsigned long alloc_mfn(void)
63668 +{
63669 +       return mfn_list[--alloc_index];
63670 +}
63671 +
63672 +static int check_mfn(int nr)
63673 +{
63674 +       struct xen_memory_reservation reservation = {
63675 +               .extent_order = 0,
63676 +               .domid        = DOMID_SELF
63677 +       };
63678 +
63679 +       if (likely(alloc_index >= nr))
63680 +               return 0;
63681 +
63682 +       set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
63683 +       reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
63684 +       alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
63685 +                                           &reservation);
63686 +
63687 +       return alloc_index >= nr ? 0 : -ENOMEM;
63688 +}
63689 +
63690 +static inline void maybe_schedule_tx_action(void)
63691 +{
63692 +       smp_mb();
63693 +       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
63694 +           !list_empty(&net_schedule_list))
63695 +               tasklet_schedule(&net_tx_tasklet);
63696 +}
63697 +
63698 +/*
63699 + * A gross way of confirming the origin of an skb data page. The slab
63700 + * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
63701 + */
63702 +static inline int is_xen_skb(struct sk_buff *skb)
63703 +{
63704 +       extern kmem_cache_t *skbuff_cachep;
63705 +       kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
63706 +       return (cp == skbuff_cachep);
63707 +}
63708 +
63709 +/*
63710 + * We can flip without copying the packet unless:
63711 + *  1. The data is not allocated from our special cache; or
63712 + *  2. The main data area is shared; or
63713 + *  3. One or more fragments are shared; or
63714 + *  4. There are chained fragments.
63715 + */
63716 +static inline int is_flippable_skb(struct sk_buff *skb)
63717 +{
63718 +       int frag;
63719 +
63720 +       if (!is_xen_skb(skb) || skb_cloned(skb))
63721 +               return 0;
63722 +
63723 +       for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
63724 +               if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
63725 +                       return 0;
63726 +       }
63727 +
63728 +       if (skb_shinfo(skb)->frag_list != NULL)
63729 +               return 0;
63730 +
63731 +       return 1;
63732 +}
63733 +
63734 +static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
63735 +{
63736 +       struct skb_shared_info *ninfo;
63737 +       struct sk_buff *nskb;
63738 +       unsigned long offset;
63739 +       int ret;
63740 +       int len;
63741 +       int headlen;
63742 +
63743 +       BUG_ON(skb_shinfo(skb)->frag_list != NULL);
63744 +
63745 +       nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
63746 +       if (unlikely(!nskb))
63747 +               goto err;
63748 +
63749 +       skb_reserve(nskb, 16 + NET_IP_ALIGN);
63750 +       headlen = nskb->end - nskb->data;
63751 +       if (headlen > skb_headlen(skb))
63752 +               headlen = skb_headlen(skb);
63753 +       ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
63754 +       BUG_ON(ret);
63755 +
63756 +       ninfo = skb_shinfo(nskb);
63757 +       ninfo->gso_size = skb_shinfo(skb)->gso_size;
63758 +       ninfo->gso_type = skb_shinfo(skb)->gso_type;
63759 +
63760 +       offset = headlen;
63761 +       len = skb->len - headlen;
63762 +
63763 +       nskb->len = skb->len;
63764 +       nskb->data_len = len;
63765 +       nskb->truesize += len;
63766 +
63767 +       while (len) {
63768 +               struct page *page;
63769 +               int copy;
63770 +               int zero;
63771 +
63772 +               if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
63773 +                       dump_stack();
63774 +                       goto err_free;
63775 +               }
63776 +
63777 +               copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
63778 +               zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
63779 +
63780 +               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
63781 +               if (unlikely(!page))
63782 +                       goto err_free;
63783 +
63784 +               ret = skb_copy_bits(skb, offset, page_address(page), copy);
63785 +               BUG_ON(ret);
63786 +
63787 +               ninfo->frags[ninfo->nr_frags].page = page;
63788 +               ninfo->frags[ninfo->nr_frags].page_offset = 0;
63789 +               ninfo->frags[ninfo->nr_frags].size = copy;
63790 +               ninfo->nr_frags++;
63791 +
63792 +               offset += copy;
63793 +               len -= copy;
63794 +       }
63795 +
63796 +       offset = nskb->data - skb->data;
63797 +
63798 +       nskb->h.raw = skb->h.raw + offset;
63799 +       nskb->nh.raw = skb->nh.raw + offset;
63800 +       nskb->mac.raw = skb->mac.raw + offset;
63801 +
63802 +       return nskb;
63803 +
63804 + err_free:
63805 +       kfree_skb(nskb);
63806 + err:
63807 +       return NULL;
63808 +}
63809 +
63810 +static inline int netbk_max_required_rx_slots(netif_t *netif)
63811 +{
63812 +       if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
63813 +               return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
63814 +       return 1; /* all in one */
63815 +}
63816 +
63817 +static inline int netbk_queue_full(netif_t *netif)
63818 +{
63819 +       RING_IDX peek   = netif->rx_req_cons_peek;
63820 +       RING_IDX needed = netbk_max_required_rx_slots(netif);
63821 +
63822 +       return ((netif->rx.sring->req_prod - peek) < needed) ||
63823 +              ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
63824 +}
63825 +
63826 +static void tx_queue_callback(unsigned long data)
63827 +{
63828 +       netif_t *netif = (netif_t *)data;
63829 +       if (netif_schedulable(netif->dev))
63830 +               netif_wake_queue(netif->dev);
63831 +}
63832 +
63833 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
63834 +{
63835 +       netif_t *netif = netdev_priv(dev);
63836 +
63837 +       BUG_ON(skb->dev != dev);
63838 +
63839 +       /* Drop the packet if the target domain has no receive buffers. */
63840 +       if (unlikely(!netif_schedulable(dev) || netbk_queue_full(netif)))
63841 +               goto drop;
63842 +
63843 +       /*
63844 +        * Copy the packet here if it's destined for a flipping interface
63845 +        * but isn't flippable (e.g. extra references to data).
63846 +        */
63847 +       if (!netif->copying_receiver && !is_flippable_skb(skb)) {
63848 +               struct sk_buff *nskb = netbk_copy_skb(skb);
63849 +               if (unlikely(nskb == NULL))
63850 +                       goto drop;
63851 +               /* Copy only the header fields we use in this driver. */
63852 +               nskb->dev = skb->dev;
63853 +               nskb->ip_summed = skb->ip_summed;
63854 +               nskb->proto_data_valid = skb->proto_data_valid;
63855 +               dev_kfree_skb(skb);
63856 +               skb = nskb;
63857 +       }
63858 +
63859 +       netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
63860 +                                  !!skb_shinfo(skb)->gso_size;
63861 +       netif_get(netif);
63862 +
63863 +       if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
63864 +               netif->rx.sring->req_event = netif->rx_req_cons_peek +
63865 +                       netbk_max_required_rx_slots(netif);
63866 +               mb(); /* request notification /then/ check & stop the queue */
63867 +               if (netbk_queue_full(netif)) {
63868 +                       netif_stop_queue(dev);
63869 +                       /*
63870 +                        * Schedule 500ms timeout to restart the queue, thus
63871 +                        * ensuring that an inactive queue will be drained.
63872 +                        * Packets will be immediately dropped until more
63873 +                        * receive buffers become available (see
63874 +                        * netbk_queue_full() check above).
63875 +                        */
63876 +                       netif->tx_queue_timeout.data = (unsigned long)netif;
63877 +                       netif->tx_queue_timeout.function = tx_queue_callback;
63878 +                       __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
63879 +               }
63880 +       }
63881 +
63882 +       skb_queue_tail(&rx_queue, skb);
63883 +       tasklet_schedule(&net_rx_tasklet);
63884 +
63885 +       return 0;
63886 +
63887 + drop:
63888 +       netif->stats.tx_dropped++;
63889 +       dev_kfree_skb(skb);
63890 +       return 0;
63891 +}
63892 +
63893 +#if 0
63894 +static void xen_network_done_notify(void)
63895 +{
63896 +       static struct net_device *eth0_dev = NULL;
63897 +       if (unlikely(eth0_dev == NULL))
63898 +               eth0_dev = __dev_get_by_name("eth0");
63899 +       netif_rx_schedule(eth0_dev);
63900 +}
63901 +/* 
63902 + * Add following to poll() function in NAPI driver (Tigon3 is example):
63903 + *  if ( xen_network_done() )
63904 + *      tg3_enable_ints(tp);
63905 + */
63906 +int xen_network_done(void)
63907 +{
63908 +       return skb_queue_empty(&rx_queue);
63909 +}
63910 +#endif
63911 +
63912 +struct netrx_pending_operations {
63913 +       unsigned trans_prod, trans_cons;
63914 +       unsigned mmu_prod, mmu_cons;
63915 +       unsigned mcl_prod, mcl_cons;
63916 +       unsigned copy_prod, copy_cons;
63917 +       unsigned meta_prod, meta_cons;
63918 +       mmu_update_t *mmu;
63919 +       gnttab_transfer_t *trans;
63920 +       gnttab_copy_t *copy;
63921 +       multicall_entry_t *mcl;
63922 +       struct netbk_rx_meta *meta;
63923 +};
63924 +
63925 +/* Set up the grant operations for this fragment.  If it's a flipping
63926 +   interface, we also set up the unmap request from here. */
63927 +static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
63928 +                         int i, struct netrx_pending_operations *npo,
63929 +                         struct page *page, unsigned long size,
63930 +                         unsigned long offset)
63931 +{
63932 +       mmu_update_t *mmu;
63933 +       gnttab_transfer_t *gop;
63934 +       gnttab_copy_t *copy_gop;
63935 +       multicall_entry_t *mcl;
63936 +       netif_rx_request_t *req;
63937 +       unsigned long old_mfn, new_mfn;
63938 +
63939 +       old_mfn = virt_to_mfn(page_address(page));
63940 +
63941 +       req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
63942 +       if (netif->copying_receiver) {
63943 +               /* The fragment needs to be copied rather than
63944 +                  flipped. */
63945 +               meta->copy = 1;
63946 +               copy_gop = npo->copy + npo->copy_prod++;
63947 +               copy_gop->flags = GNTCOPY_dest_gref;
63948 +               if (PageForeign(page)) {
63949 +                       struct pending_tx_info *src_pend =
63950 +                               &pending_tx_info[page->index];
63951 +                       copy_gop->source.domid = src_pend->netif->domid;
63952 +                       copy_gop->source.u.ref = src_pend->req.gref;
63953 +                       copy_gop->flags |= GNTCOPY_source_gref;
63954 +               } else {
63955 +                       copy_gop->source.domid = DOMID_SELF;
63956 +                       copy_gop->source.u.gmfn = old_mfn;
63957 +               }
63958 +               copy_gop->source.offset = offset;
63959 +               copy_gop->dest.domid = netif->domid;
63960 +               copy_gop->dest.offset = 0;
63961 +               copy_gop->dest.u.ref = req->gref;
63962 +               copy_gop->len = size;
63963 +       } else {
63964 +               meta->copy = 0;
63965 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
63966 +                       new_mfn = alloc_mfn();
63967 +
63968 +                       /*
63969 +                        * Set the new P2M table entry before
63970 +                        * reassigning the old data page. Heed the
63971 +                        * comment in pgtable-2level.h:pte_page(). :-)
63972 +                        */
63973 +                       set_phys_to_machine(page_to_pfn(page), new_mfn);
63974 +
63975 +                       mcl = npo->mcl + npo->mcl_prod++;
63976 +                       MULTI_update_va_mapping(mcl,
63977 +                                            (unsigned long)page_address(page),
63978 +                                            pfn_pte_ma(new_mfn, PAGE_KERNEL),
63979 +                                            0);
63980 +
63981 +                       mmu = npo->mmu + npo->mmu_prod++;
63982 +                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
63983 +                               MMU_MACHPHYS_UPDATE;
63984 +                       mmu->val = page_to_pfn(page);
63985 +               }
63986 +
63987 +               gop = npo->trans + npo->trans_prod++;
63988 +               gop->mfn = old_mfn;
63989 +               gop->domid = netif->domid;
63990 +               gop->ref = req->gref;
63991 +       }
63992 +       return req->id;
63993 +}
63994 +
63995 +static void netbk_gop_skb(struct sk_buff *skb,
63996 +                         struct netrx_pending_operations *npo)
63997 +{
63998 +       netif_t *netif = netdev_priv(skb->dev);
63999 +       int nr_frags = skb_shinfo(skb)->nr_frags;
64000 +       int i;
64001 +       int extra;
64002 +       struct netbk_rx_meta *head_meta, *meta;
64003 +
64004 +       head_meta = npo->meta + npo->meta_prod++;
64005 +       head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
64006 +       head_meta->frag.size = skb_shinfo(skb)->gso_size;
64007 +       extra = !!head_meta->frag.size + 1;
64008 +
64009 +       for (i = 0; i < nr_frags; i++) {
64010 +               meta = npo->meta + npo->meta_prod++;
64011 +               meta->frag = skb_shinfo(skb)->frags[i];
64012 +               meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
64013 +                                         meta->frag.page,
64014 +                                         meta->frag.size,
64015 +                                         meta->frag.page_offset);
64016 +       }
64017 +
64018 +       /*
64019 +        * This must occur at the end to ensure that we don't trash
64020 +        * skb_shinfo until we're done.
64021 +        */
64022 +       head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
64023 +                                      virt_to_page(skb->data),
64024 +                                      skb_headlen(skb),
64025 +                                      offset_in_page(skb->data));
64026 +
64027 +       netif->rx.req_cons += nr_frags + extra;
64028 +}
64029 +
64030 +static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
64031 +{
64032 +       int i;
64033 +
64034 +       for (i = 0; i < nr_frags; i++)
64035 +               put_page(meta[i].frag.page);
64036 +}
64037 +
64038 +/* This is a twin to netbk_gop_skb.  Assume that netbk_gop_skb was
64039 +   used to set up the operations on the top of
64040 +   netrx_pending_operations, which have since been done.  Check that
64041 +   they didn't give any errors and advance over them. */
64042 +static int netbk_check_gop(int nr_frags, domid_t domid,
64043 +                          struct netrx_pending_operations *npo)
64044 +{
64045 +       multicall_entry_t *mcl;
64046 +       gnttab_transfer_t *gop;
64047 +       gnttab_copy_t     *copy_op;
64048 +       int status = NETIF_RSP_OKAY;
64049 +       int i;
64050 +
64051 +       for (i = 0; i <= nr_frags; i++) {
64052 +               if (npo->meta[npo->meta_cons + i].copy) {
64053 +                       copy_op = npo->copy + npo->copy_cons++;
64054 +                       if (copy_op->status != GNTST_okay) {
64055 +                               DPRINTK("Bad status %d from copy to DOM%d.\n",
64056 +                                       copy_op->status, domid);
64057 +                               status = NETIF_RSP_ERROR;
64058 +                       }
64059 +               } else {
64060 +                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
64061 +                               mcl = npo->mcl + npo->mcl_cons++;
64062 +                               /* The update_va_mapping() must not fail. */
64063 +                               BUG_ON(mcl->result != 0);
64064 +                       }
64065 +
64066 +                       gop = npo->trans + npo->trans_cons++;
64067 +                       /* Check the reassignment error code. */
64068 +                       if (gop->status != 0) {
64069 +                               DPRINTK("Bad status %d from grant transfer to DOM%u\n",
64070 +                                       gop->status, domid);
64071 +                               /*
64072 +                                * Page no longer belongs to us unless
64073 +                                * GNTST_bad_page, but that should be
64074 +                                * a fatal error anyway.
64075 +                                */
64076 +                               BUG_ON(gop->status == GNTST_bad_page);
64077 +                               status = NETIF_RSP_ERROR;
64078 +                       }
64079 +               }
64080 +       }
64081 +
64082 +       return status;
64083 +}
64084 +
64085 +static void netbk_add_frag_responses(netif_t *netif, int status,
64086 +                                    struct netbk_rx_meta *meta, int nr_frags)
64087 +{
64088 +       int i;
64089 +       unsigned long offset;
64090 +
64091 +       for (i = 0; i < nr_frags; i++) {
64092 +               int id = meta[i].id;
64093 +               int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
64094 +
64095 +               if (meta[i].copy)
64096 +                       offset = 0;
64097 +               else
64098 +                       offset = meta[i].frag.page_offset;
64099 +               make_rx_response(netif, id, status, offset,
64100 +                                meta[i].frag.size, flags);
64101 +       }
64102 +}
64103 +
64104 +static void net_rx_action(unsigned long unused)
64105 +{
64106 +       netif_t *netif = NULL;
64107 +       s8 status;
64108 +       u16 id, irq, flags;
64109 +       netif_rx_response_t *resp;
64110 +       multicall_entry_t *mcl;
64111 +       struct sk_buff_head rxq;
64112 +       struct sk_buff *skb;
64113 +       int notify_nr = 0;
64114 +       int ret;
64115 +       int nr_frags;
64116 +       int count;
64117 +       unsigned long offset;
64118 +
64119 +       /*
64120 +        * Putting hundreds of bytes on the stack is considered rude.
64121 +        * Static works because a tasklet can only be on one CPU at any time.
64122 +        */
64123 +       static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
64124 +       static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
64125 +       static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
64126 +       static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
64127 +       static unsigned char rx_notify[NR_IRQS];
64128 +       static u16 notify_list[NET_RX_RING_SIZE];
64129 +       static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
64130 +
64131 +       struct netrx_pending_operations npo = {
64132 +               mmu: rx_mmu,
64133 +               trans: grant_trans_op,
64134 +               copy: grant_copy_op,
64135 +               mcl: rx_mcl,
64136 +               meta: meta};
64137 +
64138 +       skb_queue_head_init(&rxq);
64139 +
64140 +       count = 0;
64141 +
64142 +       while ((skb = skb_dequeue(&rx_queue)) != NULL) {
64143 +               nr_frags = skb_shinfo(skb)->nr_frags;
64144 +               *(int *)skb->cb = nr_frags;
64145 +
64146 +               if (!xen_feature(XENFEAT_auto_translated_physmap) &&
64147 +                   check_mfn(nr_frags + 1)) {
64148 +                       /* Memory squeeze? Back off for an arbitrary while. */
64149 +                       if (net_ratelimit())
64150 +                               WPRINTK("Memory squeeze in netback "
64151 +                                       "driver.\n");
64152 +                       mod_timer(&net_timer, jiffies + HZ);
64153 +                       skb_queue_head(&rx_queue, skb);
64154 +                       break;
64155 +               }
64156 +
64157 +               netbk_gop_skb(skb, &npo);
64158 +
64159 +               count += nr_frags + 1;
64160 +
64161 +               __skb_queue_tail(&rxq, skb);
64162 +
64163 +               /* Filled the batch queue? */
64164 +               if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
64165 +                       break;
64166 +       }
64167 +
64168 +       if (npo.mcl_prod &&
64169 +           !xen_feature(XENFEAT_auto_translated_physmap)) {
64170 +               mcl = npo.mcl + npo.mcl_prod++;
64171 +
64172 +               BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
64173 +               mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
64174 +
64175 +               mcl->op = __HYPERVISOR_mmu_update;
64176 +               mcl->args[0] = (unsigned long)rx_mmu;
64177 +               mcl->args[1] = npo.mmu_prod;
64178 +               mcl->args[2] = 0;
64179 +               mcl->args[3] = DOMID_SELF;
64180 +       }
64181 +
64182 +       if (npo.trans_prod) {
64183 +               mcl = npo.mcl + npo.mcl_prod++;
64184 +               mcl->op = __HYPERVISOR_grant_table_op;
64185 +               mcl->args[0] = GNTTABOP_transfer;
64186 +               mcl->args[1] = (unsigned long)grant_trans_op;
64187 +               mcl->args[2] = npo.trans_prod;
64188 +       }
64189 +
64190 +       if (npo.copy_prod) {
64191 +               mcl = npo.mcl + npo.mcl_prod++;
64192 +               mcl->op = __HYPERVISOR_grant_table_op;
64193 +               mcl->args[0] = GNTTABOP_copy;
64194 +               mcl->args[1] = (unsigned long)grant_copy_op;
64195 +               mcl->args[2] = npo.copy_prod;
64196 +       }
64197 +
64198 +       /* Nothing to do? */
64199 +       if (!npo.mcl_prod)
64200 +               return;
64201 +
64202 +       BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
64203 +       BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
64204 +       BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
64205 +       BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
64206 +       BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
64207 +
64208 +       ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
64209 +       BUG_ON(ret != 0);
64210 +
64211 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
64212 +               nr_frags = *(int *)skb->cb;
64213 +
64214 +               netif = netdev_priv(skb->dev);
64215 +               /* We can't rely on skb_release_data to release the
64216 +                  pages used by fragments for us, since it tries to
64217 +                  touch the pages in the fraglist.  If we're in
64218 +                  flipping mode, that doesn't work.  In copying mode,
64219 +                  we still have access to all of the pages, and so
64220 +                  it's safe to let release_data deal with it. */
64221 +               /* (Freeing the fragments is safe since we copy
64222 +                  non-linear skbs destined for flipping interfaces) */
64223 +               if (!netif->copying_receiver) {
64224 +                       atomic_set(&(skb_shinfo(skb)->dataref), 1);
64225 +                       skb_shinfo(skb)->frag_list = NULL;
64226 +                       skb_shinfo(skb)->nr_frags = 0;
64227 +                       netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
64228 +               }
64229 +
64230 +               netif->stats.tx_bytes += skb->len;
64231 +               netif->stats.tx_packets++;
64232 +
64233 +               status = netbk_check_gop(nr_frags, netif->domid, &npo);
64234 +
64235 +               id = meta[npo.meta_cons].id;
64236 +               flags = nr_frags ? NETRXF_more_data : 0;
64237 +
64238 +               if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
64239 +                       flags |= NETRXF_csum_blank | NETRXF_data_validated;
64240 +               else if (skb->proto_data_valid) /* remote but checksummed? */
64241 +                       flags |= NETRXF_data_validated;
64242 +
64243 +               if (meta[npo.meta_cons].copy)
64244 +                       offset = 0;
64245 +               else
64246 +                       offset = offset_in_page(skb->data);
64247 +               resp = make_rx_response(netif, id, status, offset,
64248 +                                       skb_headlen(skb), flags);
64249 +
64250 +               if (meta[npo.meta_cons].frag.size) {
64251 +                       struct netif_extra_info *gso =
64252 +                               (struct netif_extra_info *)
64253 +                               RING_GET_RESPONSE(&netif->rx,
64254 +                                                 netif->rx.rsp_prod_pvt++);
64255 +
64256 +                       resp->flags |= NETRXF_extra_info;
64257 +
64258 +                       gso->u.gso.size = meta[npo.meta_cons].frag.size;
64259 +                       gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
64260 +                       gso->u.gso.pad = 0;
64261 +                       gso->u.gso.features = 0;
64262 +
64263 +                       gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
64264 +                       gso->flags = 0;
64265 +               }
64266 +
64267 +               netbk_add_frag_responses(netif, status,
64268 +                                        meta + npo.meta_cons + 1,
64269 +                                        nr_frags);
64270 +
64271 +               RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
64272 +               irq = netif->irq;
64273 +               if (ret && !rx_notify[irq]) {
64274 +                       rx_notify[irq] = 1;
64275 +                       notify_list[notify_nr++] = irq;
64276 +               }
64277 +
64278 +               if (netif_queue_stopped(netif->dev) &&
64279 +                   netif_schedulable(netif->dev) &&
64280 +                   !netbk_queue_full(netif))
64281 +                       netif_wake_queue(netif->dev);
64282 +
64283 +               netif_put(netif);
64284 +               dev_kfree_skb(skb);
64285 +               npo.meta_cons += nr_frags + 1;
64286 +       }
64287 +
64288 +       while (notify_nr != 0) {
64289 +               irq = notify_list[--notify_nr];
64290 +               rx_notify[irq] = 0;
64291 +               notify_remote_via_irq(irq);
64292 +       }
64293 +
64294 +       /* More work to do? */
64295 +       if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
64296 +               tasklet_schedule(&net_rx_tasklet);
64297 +#if 0
64298 +       else
64299 +               xen_network_done_notify();
64300 +#endif
64301 +}
64302 +
64303 +static void net_alarm(unsigned long unused)
64304 +{
64305 +       tasklet_schedule(&net_rx_tasklet);
64306 +}
64307 +
64308 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
64309 +{
64310 +       netif_t *netif = netdev_priv(dev);
64311 +       return &netif->stats;
64312 +}
64313 +
64314 +static int __on_net_schedule_list(netif_t *netif)
64315 +{
64316 +       return netif->list.next != NULL;
64317 +}
64318 +
64319 +static void remove_from_net_schedule_list(netif_t *netif)
64320 +{
64321 +       spin_lock_irq(&net_schedule_list_lock);
64322 +       if (likely(__on_net_schedule_list(netif))) {
64323 +               list_del(&netif->list);
64324 +               netif->list.next = NULL;
64325 +               netif_put(netif);
64326 +       }
64327 +       spin_unlock_irq(&net_schedule_list_lock);
64328 +}
64329 +
64330 +static void add_to_net_schedule_list_tail(netif_t *netif)
64331 +{
64332 +       if (__on_net_schedule_list(netif))
64333 +               return;
64334 +
64335 +       spin_lock_irq(&net_schedule_list_lock);
64336 +       if (!__on_net_schedule_list(netif) &&
64337 +           likely(netif_schedulable(netif->dev))) {
64338 +               list_add_tail(&netif->list, &net_schedule_list);
64339 +               netif_get(netif);
64340 +       }
64341 +       spin_unlock_irq(&net_schedule_list_lock);
64342 +}
64343 +
64344 +/*
64345 + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
64346 + * If this driver is pipelining transmit requests then we can be very
64347 + * aggressive in avoiding new-packet notifications -- the frontend only needs to
64348 + * send a notification if there are no outstanding unreceived responses.
64349 + * If we may be buffering transmit buffers for any reason then we must be rather
64350 + * more conservative and treat this as the final check for pending work.
64351 + */
64352 +void netif_schedule_work(netif_t *netif)
64353 +{
64354 +       int more_to_do;
64355 +
64356 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
64357 +       more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
64358 +#else
64359 +       RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
64360 +#endif
64361 +
64362 +       if (more_to_do) {
64363 +               add_to_net_schedule_list_tail(netif);
64364 +               maybe_schedule_tx_action();
64365 +       }
64366 +}
64367 +
64368 +void netif_deschedule_work(netif_t *netif)
64369 +{
64370 +       remove_from_net_schedule_list(netif);
64371 +}
64372 +
64373 +
64374 +static void tx_add_credit(netif_t *netif)
64375 +{
64376 +       unsigned long max_burst, max_credit;
64377 +
64378 +       /*
64379 +        * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
64380 +        * Otherwise the interface can seize up due to insufficient credit.
64381 +        */
64382 +       max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
64383 +       max_burst = min(max_burst, 131072UL);
64384 +       max_burst = max(max_burst, netif->credit_bytes);
64385 +
64386 +       /* Take care that adding a new chunk of credit doesn't wrap to zero. */
64387 +       max_credit = netif->remaining_credit + netif->credit_bytes;
64388 +       if (max_credit < netif->remaining_credit)
64389 +               max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
64390 +
64391 +       netif->remaining_credit = min(max_credit, max_burst);
64392 +}
64393 +
64394 +static void tx_credit_callback(unsigned long data)
64395 +{
64396 +       netif_t *netif = (netif_t *)data;
64397 +       tx_add_credit(netif);
64398 +       netif_schedule_work(netif);
64399 +}
64400 +
64401 +static inline void net_tx_action_dealloc(void)
64402 +{
64403 +       gnttab_unmap_grant_ref_t *gop;
64404 +       u16 pending_idx;
64405 +       PEND_RING_IDX dc, dp;
64406 +       netif_t *netif;
64407 +       int ret;
64408 +
64409 +       dc = dealloc_cons;
64410 +       dp = dealloc_prod;
64411 +
64412 +       /* Ensure we see all indexes enqueued by netif_idx_release(). */
64413 +       smp_rmb();
64414 +
64415 +       /*
64416 +        * Free up any grants we have finished using
64417 +        */
64418 +       gop = tx_unmap_ops;
64419 +       while (dc != dp) {
64420 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
64421 +               gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
64422 +                                   GNTMAP_host_map,
64423 +                                   grant_tx_handle[pending_idx]);
64424 +               gop++;
64425 +       }
64426 +       ret = HYPERVISOR_grant_table_op(
64427 +               GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
64428 +       BUG_ON(ret);
64429 +
64430 +       while (dealloc_cons != dp) {
64431 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
64432 +
64433 +               netif = pending_tx_info[pending_idx].netif;
64434 +
64435 +               make_tx_response(netif, &pending_tx_info[pending_idx].req, 
64436 +                                NETIF_RSP_OKAY);
64437 +
64438 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
64439 +
64440 +               netif_put(netif);
64441 +       }
64442 +}
64443 +
64444 +static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
64445 +{
64446 +       RING_IDX cons = netif->tx.req_cons;
64447 +
64448 +       do {
64449 +               make_tx_response(netif, txp, NETIF_RSP_ERROR);
64450 +               if (cons >= end)
64451 +                       break;
64452 +               txp = RING_GET_REQUEST(&netif->tx, cons++);
64453 +       } while (1);
64454 +       netif->tx.req_cons = cons;
64455 +       netif_schedule_work(netif);
64456 +       netif_put(netif);
64457 +}
64458 +
64459 +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
64460 +                               netif_tx_request_t *txp, int work_to_do)
64461 +{
64462 +       RING_IDX cons = netif->tx.req_cons;
64463 +       int frags = 0;
64464 +
64465 +       if (!(first->flags & NETTXF_more_data))
64466 +               return 0;
64467 +
64468 +       do {
64469 +               if (frags >= work_to_do) {
64470 +                       DPRINTK("Need more frags\n");
64471 +                       return -frags;
64472 +               }
64473 +
64474 +               if (unlikely(frags >= MAX_SKB_FRAGS)) {
64475 +                       DPRINTK("Too many frags\n");
64476 +                       return -frags;
64477 +               }
64478 +
64479 +               memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
64480 +                      sizeof(*txp));
64481 +               if (txp->size > first->size) {
64482 +                       DPRINTK("Frags galore\n");
64483 +                       return -frags;
64484 +               }
64485 +
64486 +               first->size -= txp->size;
64487 +               frags++;
64488 +
64489 +               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
64490 +                       DPRINTK("txp->offset: %x, size: %u\n",
64491 +                               txp->offset, txp->size);
64492 +                       return -frags;
64493 +               }
64494 +       } while ((txp++)->flags & NETTXF_more_data);
64495 +
64496 +       return frags;
64497 +}
64498 +
64499 +static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
64500 +                                                 struct sk_buff *skb,
64501 +                                                 netif_tx_request_t *txp,
64502 +                                                 gnttab_map_grant_ref_t *mop)
64503 +{
64504 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
64505 +       skb_frag_t *frags = shinfo->frags;
64506 +       unsigned long pending_idx = *((u16 *)skb->data);
64507 +       int i, start;
64508 +
64509 +       /* Skip first skb fragment if it is on same page as header fragment. */
64510 +       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
64511 +
64512 +       for (i = start; i < shinfo->nr_frags; i++, txp++) {
64513 +               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
64514 +
64515 +               gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
64516 +                                 GNTMAP_host_map | GNTMAP_readonly,
64517 +                                 txp->gref, netif->domid);
64518 +
64519 +               memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
64520 +               netif_get(netif);
64521 +               pending_tx_info[pending_idx].netif = netif;
64522 +               frags[i].page = (void *)pending_idx;
64523 +       }
64524 +
64525 +       return mop;
64526 +}
64527 +
64528 +static int netbk_tx_check_mop(struct sk_buff *skb,
64529 +                              gnttab_map_grant_ref_t **mopp)
64530 +{
64531 +       gnttab_map_grant_ref_t *mop = *mopp;
64532 +       int pending_idx = *((u16 *)skb->data);
64533 +       netif_t *netif = pending_tx_info[pending_idx].netif;
64534 +       netif_tx_request_t *txp;
64535 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
64536 +       int nr_frags = shinfo->nr_frags;
64537 +       int i, err, start;
64538 +
64539 +       /* Check status of header. */
64540 +       err = mop->status;
64541 +       if (unlikely(err)) {
64542 +               txp = &pending_tx_info[pending_idx].req;
64543 +               make_tx_response(netif, txp, NETIF_RSP_ERROR);
64544 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
64545 +               netif_put(netif);
64546 +       } else {
64547 +               set_phys_to_machine(
64548 +                       __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
64549 +                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
64550 +               grant_tx_handle[pending_idx] = mop->handle;
64551 +       }
64552 +
64553 +       /* Skip first skb fragment if it is on same page as header fragment. */
64554 +       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
64555 +
64556 +       for (i = start; i < nr_frags; i++) {
64557 +               int j, newerr;
64558 +
64559 +               pending_idx = (unsigned long)shinfo->frags[i].page;
64560 +
64561 +               /* Check error status: if okay then remember grant handle. */
64562 +               newerr = (++mop)->status;
64563 +               if (likely(!newerr)) {
64564 +                       set_phys_to_machine(
64565 +                               __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
64566 +                               FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
64567 +                       grant_tx_handle[pending_idx] = mop->handle;
64568 +                       /* Had a previous error? Invalidate this fragment. */
64569 +                       if (unlikely(err))
64570 +                               netif_idx_release(pending_idx);
64571 +                       continue;
64572 +               }
64573 +
64574 +               /* Error on this fragment: respond to client with an error. */
64575 +               txp = &pending_tx_info[pending_idx].req;
64576 +               make_tx_response(netif, txp, NETIF_RSP_ERROR);
64577 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
64578 +               netif_put(netif);
64579 +
64580 +               /* Not the first error? Preceding frags already invalidated. */
64581 +               if (err)
64582 +                       continue;
64583 +
64584 +               /* First error: invalidate header and preceding fragments. */
64585 +               pending_idx = *((u16 *)skb->data);
64586 +               netif_idx_release(pending_idx);
64587 +               for (j = start; j < i; j++) {
64588 +                       pending_idx = (unsigned long)shinfo->frags[j].page;
64589 +                       netif_idx_release(pending_idx);
64590 +               }
64591 +
64592 +               /* Remember the error: invalidate all subsequent fragments. */
64593 +               err = newerr;
64594 +       }
64595 +
64596 +       *mopp = mop + 1;
64597 +       return err;
64598 +}
64599 +
64600 +static void netbk_fill_frags(struct sk_buff *skb)
64601 +{
64602 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
64603 +       int nr_frags = shinfo->nr_frags;
64604 +       int i;
64605 +
64606 +       for (i = 0; i < nr_frags; i++) {
64607 +               skb_frag_t *frag = shinfo->frags + i;
64608 +               netif_tx_request_t *txp;
64609 +               unsigned long pending_idx;
64610 +
64611 +               pending_idx = (unsigned long)frag->page;
64612 +               txp = &pending_tx_info[pending_idx].req;
64613 +               frag->page = virt_to_page(idx_to_kaddr(pending_idx));
64614 +               frag->size = txp->size;
64615 +               frag->page_offset = txp->offset;
64616 +
64617 +               skb->len += txp->size;
64618 +               skb->data_len += txp->size;
64619 +               skb->truesize += txp->size;
64620 +       }
64621 +}
64622 +
64623 +int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
64624 +                    int work_to_do)
64625 +{
64626 +       struct netif_extra_info extra;
64627 +       RING_IDX cons = netif->tx.req_cons;
64628 +
64629 +       do {
64630 +               if (unlikely(work_to_do-- <= 0)) {
64631 +                       DPRINTK("Missing extra info\n");
64632 +                       return -EBADR;
64633 +               }
64634 +
64635 +               memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
64636 +                      sizeof(extra));
64637 +               if (unlikely(!extra.type ||
64638 +                            extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
64639 +                       netif->tx.req_cons = ++cons;
64640 +                       DPRINTK("Invalid extra type: %d\n", extra.type);
64641 +                       return -EINVAL;
64642 +               }
64643 +
64644 +               memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
64645 +               netif->tx.req_cons = ++cons;
64646 +       } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
64647 +
64648 +       return work_to_do;
64649 +}
64650 +
64651 +static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
64652 +{
64653 +       if (!gso->u.gso.size) {
64654 +               DPRINTK("GSO size must not be zero.\n");
64655 +               return -EINVAL;
64656 +       }
64657 +
64658 +       /* Currently only TCPv4 segmentation offload is supported. */
64659 +       if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
64660 +               DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
64661 +               return -EINVAL;
64662 +       }
64663 +
64664 +       skb_shinfo(skb)->gso_size = gso->u.gso.size;
64665 +       skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
64666 +
64667 +       /* Header must be checked, and gso_segs computed. */
64668 +       skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
64669 +       skb_shinfo(skb)->gso_segs = 0;
64670 +
64671 +       return 0;
64672 +}
64673 +
64674 +/* Called after netfront has transmitted */
64675 +static void net_tx_action(unsigned long unused)
64676 +{
64677 +       struct list_head *ent;
64678 +       struct sk_buff *skb;
64679 +       netif_t *netif;
64680 +       netif_tx_request_t txreq;
64681 +       netif_tx_request_t txfrags[MAX_SKB_FRAGS];
64682 +       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
64683 +       u16 pending_idx;
64684 +       RING_IDX i;
64685 +       gnttab_map_grant_ref_t *mop;
64686 +       unsigned int data_len;
64687 +       int ret, work_to_do;
64688 +
64689 +       if (dealloc_cons != dealloc_prod)
64690 +               net_tx_action_dealloc();
64691 +
64692 +       mop = tx_map_ops;
64693 +       while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
64694 +               !list_empty(&net_schedule_list)) {
64695 +               /* Get a netif from the list with work to do. */
64696 +               ent = net_schedule_list.next;
64697 +               netif = list_entry(ent, netif_t, list);
64698 +               netif_get(netif);
64699 +               remove_from_net_schedule_list(netif);
64700 +
64701 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
64702 +               if (!work_to_do) {
64703 +                       netif_put(netif);
64704 +                       continue;
64705 +               }
64706 +
64707 +               i = netif->tx.req_cons;
64708 +               rmb(); /* Ensure that we see the request before we copy it. */
64709 +               memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
64710 +
64711 +               /* Credit-based scheduling. */
64712 +               if (txreq.size > netif->remaining_credit) {
64713 +                       unsigned long now = jiffies;
64714 +                       unsigned long next_credit = 
64715 +                               netif->credit_timeout.expires +
64716 +                               msecs_to_jiffies(netif->credit_usec / 1000);
64717 +
64718 +                       /* Timer could already be pending in rare cases. */
64719 +                       if (timer_pending(&netif->credit_timeout)) {
64720 +                               netif_put(netif);
64721 +                               continue;
64722 +                       }
64723 +
64724 +                       /* Passed the point where we can replenish credit? */
64725 +                       if (time_after_eq(now, next_credit)) {
64726 +                               netif->credit_timeout.expires = now;
64727 +                               tx_add_credit(netif);
64728 +                       }
64729 +
64730 +                       /* Still too big to send right now? Set a callback. */
64731 +                       if (txreq.size > netif->remaining_credit) {
64732 +                               netif->credit_timeout.data     =
64733 +                                       (unsigned long)netif;
64734 +                               netif->credit_timeout.function =
64735 +                                       tx_credit_callback;
64736 +                               __mod_timer(&netif->credit_timeout,
64737 +                                           next_credit);
64738 +                               netif_put(netif);
64739 +                               continue;
64740 +                       }
64741 +               }
64742 +               netif->remaining_credit -= txreq.size;
64743 +
64744 +               work_to_do--;
64745 +               netif->tx.req_cons = ++i;
64746 +
64747 +               memset(extras, 0, sizeof(extras));
64748 +               if (txreq.flags & NETTXF_extra_info) {
64749 +                       work_to_do = netbk_get_extras(netif, extras,
64750 +                                                     work_to_do);
64751 +                       i = netif->tx.req_cons;
64752 +                       if (unlikely(work_to_do < 0)) {
64753 +                               netbk_tx_err(netif, &txreq, i);
64754 +                               continue;
64755 +                       }
64756 +               }
64757 +
64758 +               ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
64759 +               if (unlikely(ret < 0)) {
64760 +                       netbk_tx_err(netif, &txreq, i - ret);
64761 +                       continue;
64762 +               }
64763 +               i += ret;
64764 +
64765 +               if (unlikely(txreq.size < ETH_HLEN)) {
64766 +                       DPRINTK("Bad packet size: %d\n", txreq.size);
64767 +                       netbk_tx_err(netif, &txreq, i);
64768 +                       continue;
64769 +               }
64770 +
64771 +               /* The payload must not cross a page boundary, since it cannot be fragmented. */
64772 +               if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
64773 +                       DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
64774 +                               txreq.offset, txreq.size, 
64775 +                               (txreq.offset &~PAGE_MASK) + txreq.size);
64776 +                       netbk_tx_err(netif, &txreq, i);
64777 +                       continue;
64778 +               }
64779 +
64780 +               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
64781 +
64782 +               data_len = (txreq.size > PKT_PROT_LEN &&
64783 +                           ret < MAX_SKB_FRAGS) ?
64784 +                       PKT_PROT_LEN : txreq.size;
64785 +
64786 +               skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
64787 +                               GFP_ATOMIC | __GFP_NOWARN);
64788 +               if (unlikely(skb == NULL)) {
64789 +                       DPRINTK("Can't allocate a skb in start_xmit.\n");
64790 +                       netbk_tx_err(netif, &txreq, i);
64791 +                       break;
64792 +               }
64793 +
64794 +               /* Packets passed to netif_rx() must have some headroom. */
64795 +               skb_reserve(skb, 16 + NET_IP_ALIGN);
64796 +
64797 +               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
64798 +                       struct netif_extra_info *gso;
64799 +                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
64800 +
64801 +                       if (netbk_set_skb_gso(skb, gso)) {
64802 +                               kfree_skb(skb);
64803 +                               netbk_tx_err(netif, &txreq, i);
64804 +                               continue;
64805 +                       }
64806 +               }
64807 +
64808 +               gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
64809 +                                 GNTMAP_host_map | GNTMAP_readonly,
64810 +                                 txreq.gref, netif->domid);
64811 +               mop++;
64812 +
64813 +               memcpy(&pending_tx_info[pending_idx].req,
64814 +                      &txreq, sizeof(txreq));
64815 +               pending_tx_info[pending_idx].netif = netif;
64816 +               *((u16 *)skb->data) = pending_idx;
64817 +
64818 +               __skb_put(skb, data_len);
64819 +
64820 +               skb_shinfo(skb)->nr_frags = ret;
64821 +               if (data_len < txreq.size) {
64822 +                       skb_shinfo(skb)->nr_frags++;
64823 +                       skb_shinfo(skb)->frags[0].page =
64824 +                               (void *)(unsigned long)pending_idx;
64825 +               } else {
64826 +                       /* Discriminate from any valid pending_idx value. */
64827 +                       skb_shinfo(skb)->frags[0].page = (void *)~0UL;
64828 +               }
64829 +
64830 +               __skb_queue_tail(&tx_queue, skb);
64831 +
64832 +               pending_cons++;
64833 +
64834 +               mop = netbk_get_requests(netif, skb, txfrags, mop);
64835 +
64836 +               netif->tx.req_cons = i;
64837 +               netif_schedule_work(netif);
64838 +
64839 +               if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
64840 +                       break;
64841 +       }
64842 +
64843 +       if (mop == tx_map_ops)
64844 +               return;
64845 +
64846 +       ret = HYPERVISOR_grant_table_op(
64847 +               GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
64848 +       BUG_ON(ret);
64849 +
64850 +       mop = tx_map_ops;
64851 +       while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
64852 +               netif_tx_request_t *txp;
64853 +
64854 +               pending_idx = *((u16 *)skb->data);
64855 +               netif       = pending_tx_info[pending_idx].netif;
64856 +               txp         = &pending_tx_info[pending_idx].req;
64857 +
64858 +               /* Check the remap error code. */
64859 +               if (unlikely(netbk_tx_check_mop(skb, &mop))) {
64860 +                       printk(KERN_ALERT "#### netback grant fails\n");
64861 +                       skb_shinfo(skb)->nr_frags = 0;
64862 +                       kfree_skb(skb);
64863 +                       continue;
64864 +               }
64865 +
64866 +               data_len = skb->len;
64867 +               memcpy(skb->data,
64868 +                      (void *)(idx_to_kaddr(pending_idx)|txp->offset),
64869 +                      data_len);
64870 +               if (data_len < txp->size) {
64871 +                       /* Append the packet payload as a fragment. */
64872 +                       txp->offset += data_len;
64873 +                       txp->size -= data_len;
64874 +               } else {
64875 +                       /* Schedule a response immediately. */
64876 +                       netif_idx_release(pending_idx);
64877 +               }
64878 +
64879 +               /*
64880 +                * Old frontends do not assert data_validated but we
64881 +                * can infer it from csum_blank so test both flags.
64882 +                */
64883 +               if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
64884 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
64885 +                       skb->proto_data_valid = 1;
64886 +               } else {
64887 +                       skb->ip_summed = CHECKSUM_NONE;
64888 +                       skb->proto_data_valid = 0;
64889 +               }
64890 +               skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
64891 +
64892 +               netbk_fill_frags(skb);
64893 +
64894 +               skb->dev      = netif->dev;
64895 +               skb->protocol = eth_type_trans(skb, skb->dev);
64896 +
64897 +               netif->stats.rx_bytes += skb->len;
64898 +               netif->stats.rx_packets++;
64899 +
64900 +               netif_rx(skb);
64901 +               netif->dev->last_rx = jiffies;
64902 +       }
64903 +}
64904 +
64905 +static void netif_idx_release(u16 pending_idx)
64906 +{
64907 +       static DEFINE_SPINLOCK(_lock);
64908 +       unsigned long flags;
64909 +
64910 +       spin_lock_irqsave(&_lock, flags);
64911 +       dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
64912 +       /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
64913 +       smp_wmb();
64914 +       dealloc_prod++;
64915 +       spin_unlock_irqrestore(&_lock, flags);
64916 +
64917 +       tasklet_schedule(&net_tx_tasklet);
64918 +}
64919 +
64920 +static void netif_page_release(struct page *page)
64921 +{
64922 +       /* Ready for next use. */
64923 +       init_page_count(page);
64924 +
64925 +       netif_idx_release(page->index);
64926 +}
64927 +
64928 +irqreturn_t netif_be_int(int irq, void *dev_id)
64929 +{
64930 +       netif_t *netif = dev_id;
64931 +
64932 +       add_to_net_schedule_list_tail(netif);
64933 +       maybe_schedule_tx_action();
64934 +
64935 +       if (netif_schedulable(netif->dev) && !netbk_queue_full(netif))
64936 +               netif_wake_queue(netif->dev);
64937 +
64938 +       return IRQ_HANDLED;
64939 +}
64940 +
64941 +static void make_tx_response(netif_t *netif, 
64942 +                            netif_tx_request_t *txp,
64943 +                            s8       st)
64944 +{
64945 +       RING_IDX i = netif->tx.rsp_prod_pvt;
64946 +       netif_tx_response_t *resp;
64947 +       int notify;
64948 +
64949 +       resp = RING_GET_RESPONSE(&netif->tx, i);
64950 +       resp->id     = txp->id;
64951 +       resp->status = st;
64952 +
64953 +       if (txp->flags & NETTXF_extra_info)
64954 +               RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
64955 +
64956 +       netif->tx.rsp_prod_pvt = ++i;
64957 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
64958 +       if (notify)
64959 +               notify_remote_via_irq(netif->irq);
64960 +
64961 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
64962 +       if (i == netif->tx.req_cons) {
64963 +               int more_to_do;
64964 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
64965 +               if (more_to_do)
64966 +                       add_to_net_schedule_list_tail(netif);
64967 +       }
64968 +#endif
64969 +}
64970 +
64971 +static netif_rx_response_t *make_rx_response(netif_t *netif, 
64972 +                                            u16      id, 
64973 +                                            s8       st,
64974 +                                            u16      offset,
64975 +                                            u16      size,
64976 +                                            u16      flags)
64977 +{
64978 +       RING_IDX i = netif->rx.rsp_prod_pvt;
64979 +       netif_rx_response_t *resp;
64980 +
64981 +       resp = RING_GET_RESPONSE(&netif->rx, i);
64982 +       resp->offset     = offset;
64983 +       resp->flags      = flags;
64984 +       resp->id         = id;
64985 +       resp->status     = (s16)size;
64986 +       if (st < 0)
64987 +               resp->status = (s16)st;
64988 +
64989 +       netif->rx.rsp_prod_pvt = ++i;
64990 +
64991 +       return resp;
64992 +}
64993 +
64994 +#ifdef NETBE_DEBUG_INTERRUPT
64995 +static irqreturn_t netif_be_dbg(int irq, void *dev_id)
64996 +{
64997 +       struct list_head *ent;
64998 +       netif_t *netif;
64999 +       int i = 0;
65000 +
65001 +       printk(KERN_ALERT "netif_schedule_list:\n");
65002 +       spin_lock_irq(&net_schedule_list_lock);
65003 +
65004 +       list_for_each (ent, &net_schedule_list) {
65005 +               netif = list_entry(ent, netif_t, list);
65006 +               printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
65007 +                      "rx_resp_prod=%08x\n",
65008 +                      i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
65009 +               printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
65010 +                      netif->tx.req_cons, netif->tx.rsp_prod_pvt);
65011 +               printk(KERN_ALERT "   shared(rx_req_prod=%08x "
65012 +                      "rx_resp_prod=%08x\n",
65013 +                      netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
65014 +               printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
65015 +                      netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
65016 +               printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
65017 +                      netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
65018 +               i++;
65019 +       }
65020 +
65021 +       spin_unlock_irq(&net_schedule_list_lock);
65022 +       printk(KERN_ALERT " ** End of netif_schedule_list **\n");
65023 +
65024 +       return IRQ_HANDLED;
65025 +}
65026 +#endif
65027 +
65028 +static int __init netback_init(void)
65029 +{
65030 +       int i;
65031 +       struct page *page;
65032 +
65033 +       if (!is_running_on_xen())
65034 +               return -ENODEV;
65035 +
65036 +       /* We can increase reservation by this much in net_rx_action(). */
65037 +       balloon_update_driver_allowance(NET_RX_RING_SIZE);
65038 +
65039 +       skb_queue_head_init(&rx_queue);
65040 +       skb_queue_head_init(&tx_queue);
65041 +
65042 +       init_timer(&net_timer);
65043 +       net_timer.data = 0;
65044 +       net_timer.function = net_alarm;
65045 +
65046 +       mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
65047 +       if (mmap_pages == NULL) {
65048 +               printk("%s: out of memory\n", __FUNCTION__);
65049 +               return -ENOMEM;
65050 +       }
65051 +
65052 +       for (i = 0; i < MAX_PENDING_REQS; i++) {
65053 +               page = mmap_pages[i];
65054 +               SetPageForeign(page, netif_page_release);
65055 +               page->index = i;
65056 +       }
65057 +
65058 +       pending_cons = 0;
65059 +       pending_prod = MAX_PENDING_REQS;
65060 +       for (i = 0; i < MAX_PENDING_REQS; i++)
65061 +               pending_ring[i] = i;
65062 +
65063 +       spin_lock_init(&net_schedule_list_lock);
65064 +       INIT_LIST_HEAD(&net_schedule_list);
65065 +
65066 +       netif_xenbus_init();
65067 +
65068 +#ifdef NETBE_DEBUG_INTERRUPT
65069 +       (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
65070 +                                     0,
65071 +                                     netif_be_dbg,
65072 +                                     SA_SHIRQ, 
65073 +                                     "net-be-dbg",
65074 +                                     &netif_be_dbg);
65075 +#endif
65076 +
65077 +       return 0;
65078 +}
65079 +
65080 +module_init(netback_init);
65081 +
65082 +MODULE_LICENSE("Dual BSD/GPL");
65083 diff -ruNp linux-2.6.19/drivers/xen/netback/xenbus.c linux-2.6.19-xen-3.0.4/drivers/xen/netback/xenbus.c
65084 --- linux-2.6.19/drivers/xen/netback/xenbus.c   1970-01-01 00:00:00.000000000 +0000
65085 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netback/xenbus.c 2007-02-02 19:10:45.000000000 +0000
65086 @@ -0,0 +1,450 @@
65087 +/*  Xenbus code for netif backend
65088 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
65089 +    Copyright (C) 2005 XenSource Ltd
65090 +
65091 +    This program is free software; you can redistribute it and/or modify
65092 +    it under the terms of the GNU General Public License as published by
65093 +    the Free Software Foundation; either version 2 of the License, or
65094 +    (at your option) any later version.
65095 +
65096 +    This program is distributed in the hope that it will be useful,
65097 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
65098 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
65099 +    GNU General Public License for more details.
65100 +
65101 +    You should have received a copy of the GNU General Public License
65102 +    along with this program; if not, write to the Free Software
65103 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
65104 +*/
65105 +
65106 +#include <stdarg.h>
65107 +#include <linux/module.h>
65108 +#include <xen/xenbus.h>
65109 +#include "common.h"
65110 +
65111 +#if 0
65112 +#undef DPRINTK
65113 +#define DPRINTK(fmt, args...) \
65114 +    printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
65115 +#endif
65116 +
65117 +struct backend_info {
65118 +       struct xenbus_device *dev;
65119 +       netif_t *netif;
65120 +       enum xenbus_state frontend_state;
65121 +};
65122 +
65123 +static int connect_rings(struct backend_info *);
65124 +static void connect(struct backend_info *);
65125 +static void backend_create_netif(struct backend_info *be);
65126 +
65127 +static int netback_remove(struct xenbus_device *dev)
65128 +{
65129 +       struct backend_info *be = dev->dev.driver_data;
65130 +
65131 +       if (be->netif) {
65132 +               netif_disconnect(be->netif);
65133 +               be->netif = NULL;
65134 +       }
65135 +       kfree(be);
65136 +       dev->dev.driver_data = NULL;
65137 +       return 0;
65138 +}
65139 +
65140 +
65141 +/**
65142 + * Entry point to this code when a new device is created.  Allocate the basic
65143 + * structures and switch to InitWait.
65144 + */
65145 +static int netback_probe(struct xenbus_device *dev,
65146 +                        const struct xenbus_device_id *id)
65147 +{
65148 +       const char *message;
65149 +       struct xenbus_transaction xbt;
65150 +       int err;
65151 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
65152 +                                         GFP_KERNEL);
65153 +       if (!be) {
65154 +               xenbus_dev_fatal(dev, -ENOMEM,
65155 +                                "allocating backend structure");
65156 +               return -ENOMEM;
65157 +       }
65158 +
65159 +       be->dev = dev;
65160 +       dev->dev.driver_data = be;
65161 +
65162 +       do {
65163 +               err = xenbus_transaction_start(&xbt);
65164 +               if (err) {
65165 +                       xenbus_dev_fatal(dev, err, "starting transaction");
65166 +                       goto fail;
65167 +               }
65168 +
65169 +               err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
65170 +               if (err) {
65171 +                       message = "writing feature-sg";
65172 +                       goto abort_transaction;
65173 +               }
65174 +
65175 +               err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
65176 +                                   "%d", 1);
65177 +               if (err) {
65178 +                       message = "writing feature-gso-tcpv4";
65179 +                       goto abort_transaction;
65180 +               }
65181 +
65182 +               /* We support rx-copy path. */
65183 +               err = xenbus_printf(xbt, dev->nodename,
65184 +                                   "feature-rx-copy", "%d", 1);
65185 +               if (err) {
65186 +                       message = "writing feature-rx-copy";
65187 +                       goto abort_transaction;
65188 +               }
65189 +
65190 +               /*
65191 +                * We don't support rx-flip path (except old guests who don't
65192 +                * grok this feature flag).
65193 +                */
65194 +               err = xenbus_printf(xbt, dev->nodename,
65195 +                                   "feature-rx-flip", "%d", 0);
65196 +               if (err) {
65197 +                       message = "writing feature-rx-flip";
65198 +                       goto abort_transaction;
65199 +               }
65200 +
65201 +               err = xenbus_transaction_end(xbt, 0);
65202 +       } while (err == -EAGAIN);
65203 +
65204 +       if (err) {
65205 +               xenbus_dev_fatal(dev, err, "completing transaction");
65206 +               goto fail;
65207 +       }
65208 +
65209 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
65210 +       if (err)
65211 +               goto fail;
65212 +
65213 +       /* This kicks hotplug scripts, so do it immediately. */
65214 +       backend_create_netif(be);
65215 +
65216 +       return 0;
65217 +
65218 +abort_transaction:
65219 +       xenbus_transaction_end(xbt, 1);
65220 +       xenbus_dev_fatal(dev, err, "%s", message);
65221 +fail:
65222 +       DPRINTK("failed");
65223 +       netback_remove(dev);
65224 +       return err;
65225 +}
65226 +
65227 +
65228 +/**
65229 + * Handle the creation of the hotplug script environment.  We add the script
65230 + * and vif variables to the environment, for the benefit of the vif-* hotplug
65231 + * scripts.
65232 + */
65233 +static int netback_uevent(struct xenbus_device *xdev, char **envp,
65234 +                         int num_envp, char *buffer, int buffer_size)
65235 +{
65236 +       struct backend_info *be = xdev->dev.driver_data;
65237 +       netif_t *netif = be->netif;
65238 +       int i = 0, length = 0;
65239 +       char *val;
65240 +
65241 +       DPRINTK("netback_uevent");
65242 +
65243 +       val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
65244 +       if (IS_ERR(val)) {
65245 +               int err = PTR_ERR(val);
65246 +               xenbus_dev_fatal(xdev, err, "reading script");
65247 +               return err;
65248 +       }
65249 +       else {
65250 +               add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
65251 +                              &length, "script=%s", val);
65252 +               kfree(val);
65253 +       }
65254 +
65255 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
65256 +                      "vif=%s", netif->dev->name);
65257 +
65258 +       envp[i] = NULL;
65259 +
65260 +       return 0;
65261 +}
65262 +
65263 +
65264 +static void backend_create_netif(struct backend_info *be)
65265 +{
65266 +       int err;
65267 +       long handle;
65268 +       struct xenbus_device *dev = be->dev;
65269 +
65270 +       if (be->netif != NULL)
65271 +               return;
65272 +
65273 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
65274 +       if (err != 1) {
65275 +               xenbus_dev_fatal(dev, err, "reading handle");
65276 +               return;
65277 +       }
65278 +
65279 +       be->netif = netif_alloc(dev->otherend_id, handle);
65280 +       if (IS_ERR(be->netif)) {
65281 +               err = PTR_ERR(be->netif);
65282 +               be->netif = NULL;
65283 +               xenbus_dev_fatal(dev, err, "creating interface");
65284 +               return;
65285 +       }
65286 +
65287 +       kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
65288 +}
65289 +
65290 +
65291 +/**
65292 + * Callback received when the frontend's state changes.
65293 + */
65294 +static void frontend_changed(struct xenbus_device *dev,
65295 +                            enum xenbus_state frontend_state)
65296 +{
65297 +       struct backend_info *be = dev->dev.driver_data;
65298 +
65299 +       DPRINTK("%s", xenbus_strstate(frontend_state));
65300 +
65301 +       be->frontend_state = frontend_state;
65302 +
65303 +       switch (frontend_state) {
65304 +       case XenbusStateInitialising:
65305 +               if (dev->state == XenbusStateClosed) {
65306 +                       printk("%s: %s: prepare for reconnect\n",
65307 +                              __FUNCTION__, dev->nodename);
65308 +                       if (be->netif) {
65309 +                               netif_disconnect(be->netif);
65310 +                               be->netif = NULL;
65311 +                       }
65312 +                       xenbus_switch_state(dev, XenbusStateInitWait);
65313 +               }
65314 +               break;
65315 +
65316 +       case XenbusStateInitialised:
65317 +               break;
65318 +
65319 +       case XenbusStateConnected:
65320 +               backend_create_netif(be);
65321 +               if (be->netif)
65322 +                       connect(be);
65323 +               break;
65324 +
65325 +       case XenbusStateClosing:
65326 +               xenbus_switch_state(dev, XenbusStateClosing);
65327 +               break;
65328 +
65329 +       case XenbusStateClosed:
65330 +               xenbus_switch_state(dev, XenbusStateClosed);
65331 +               if (xenbus_dev_is_online(dev))
65332 +                       break;
65333 +               /* fall through if not online */
65334 +       case XenbusStateUnknown:
65335 +               if (be->netif != NULL)
65336 +                       kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
65337 +               device_unregister(&dev->dev);
65338 +               break;
65339 +
65340 +       default:
65341 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
65342 +                                frontend_state);
65343 +               break;
65344 +       }
65345 +}
65346 +
65347 +
65348 +static void xen_net_read_rate(struct xenbus_device *dev,
65349 +                             unsigned long *bytes, unsigned long *usec)
65350 +{
65351 +       char *s, *e;
65352 +       unsigned long b, u;
65353 +       char *ratestr;
65354 +
65355 +       /* Default to unlimited bandwidth. */
65356 +       *bytes = ~0UL;
65357 +       *usec = 0;
65358 +
65359 +       ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
65360 +       if (IS_ERR(ratestr))
65361 +               return;
65362 +
65363 +       s = ratestr;
65364 +       b = simple_strtoul(s, &e, 10);
65365 +       if ((s == e) || (*e != ','))
65366 +               goto fail;
65367 +
65368 +       s = e + 1;
65369 +       u = simple_strtoul(s, &e, 10);
65370 +       if ((s == e) || (*e != '\0'))
65371 +               goto fail;
65372 +
65373 +       *bytes = b;
65374 +       *usec = u;
65375 +
65376 +       kfree(ratestr);
65377 +       return;
65378 +
65379 + fail:
65380 +       WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
65381 +       kfree(ratestr);
65382 +}
65383 +
65384 +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
65385 +{
65386 +       char *s, *e, *macstr;
65387 +       int i;
65388 +
65389 +       macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
65390 +       if (IS_ERR(macstr))
65391 +               return PTR_ERR(macstr);
65392 +
65393 +       for (i = 0; i < ETH_ALEN; i++) {
65394 +               mac[i] = simple_strtoul(s, &e, 16);
65395 +               if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
65396 +                       kfree(macstr);
65397 +                       return -ENOENT;
65398 +               }
65399 +               s = e+1;
65400 +       }
65401 +
65402 +       kfree(macstr);
65403 +       return 0;
65404 +}
65405 +
65406 +static void connect(struct backend_info *be)
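The xenstore "mac" node is a plain "aa:bb:cc:dd:ee:ff" string, and xen_net_read_mac() above walks it one octet at a time, requiring a ':' after every octet except the last, which must end the string. A minimal user-space sketch of the same parsing, with strtoul() standing in for the kernel's simple_strtoul() and the xenbus read omitted (parse_mac and the sample address below are illustrative only):

        #include <stdio.h>
        #include <stdlib.h>

        #define ETH_ALEN 6

        /* Parse "aa:bb:cc:dd:ee:ff" into six octets; returns 0 on success. */
        static int parse_mac(const char *s, unsigned char mac[ETH_ALEN])
        {
                char *e;
                int i;

                for (i = 0; i < ETH_ALEN; i++) {
                        mac[i] = (unsigned char)strtoul(s, &e, 16);
                        /* Octets are ':'-separated; the last must end the string. */
                        if (s == e || *e != ((i == ETH_ALEN - 1) ? '\0' : ':'))
                                return -1;
                        s = e + 1;
                }
                return 0;
        }

        int main(void)
        {
                unsigned char mac[ETH_ALEN];

                if (parse_mac("00:16:3e:5e:6c:00", mac) == 0)
                        printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
                               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
                return 0;
        }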
65407 +{
65408 +       int err;
65409 +       struct xenbus_device *dev = be->dev;
65410 +
65411 +       err = connect_rings(be);
65412 +       if (err)
65413 +               return;
65414 +
65415 +       err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
65416 +       if (err) {
65417 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
65418 +               return;
65419 +       }
65420 +
65421 +       xen_net_read_rate(dev, &be->netif->credit_bytes,
65422 +                         &be->netif->credit_usec);
65423 +       be->netif->remaining_credit = be->netif->credit_bytes;
65424 +
65425 +       xenbus_switch_state(dev, XenbusStateConnected);
65426 +
65427 +       /* May not get a kick from the frontend, so start the tx_queue now. */
65428 +       if (!netbk_can_queue(be->netif->dev))
65429 +               netif_wake_queue(be->netif->dev);
65430 +}
65431 +
65432 +
65433 +static int connect_rings(struct backend_info *be)
65434 +{
65435 +       struct xenbus_device *dev = be->dev;
65436 +       unsigned long tx_ring_ref, rx_ring_ref;
65437 +       unsigned int evtchn, rx_copy;
65438 +       int err;
65439 +       int val;
65440 +
65441 +       DPRINTK("");
65442 +
65443 +       err = xenbus_gather(XBT_NIL, dev->otherend,
65444 +                           "tx-ring-ref", "%lu", &tx_ring_ref,
65445 +                           "rx-ring-ref", "%lu", &rx_ring_ref,
65446 +                           "event-channel", "%u", &evtchn, NULL);
65447 +       if (err) {
65448 +               xenbus_dev_fatal(dev, err,
65449 +                                "reading %s/ring-ref and event-channel",
65450 +                                dev->otherend);
65451 +               return err;
65452 +       }
65453 +
65454 +       err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
65455 +                          &rx_copy);
65456 +       if (err == -ENOENT) {
65457 +               err = 0;
65458 +               rx_copy = 0;
65459 +       }
65460 +       if (err < 0) {
65461 +               xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
65462 +                                dev->otherend);
65463 +               return err;
65464 +       }
65465 +       be->netif->copying_receiver = !!rx_copy;
65466 +
65467 +       if (be->netif->dev->tx_queue_len != 0) {
65468 +               if (xenbus_scanf(XBT_NIL, dev->otherend,
65469 +                                "feature-rx-notify", "%d", &val) < 0)
65470 +                       val = 0;
65471 +               if (val)
65472 +                       be->netif->can_queue = 1;
65473 +               else
65474 +                       /* Must be non-zero for pfifo_fast to work. */
65475 +                       be->netif->dev->tx_queue_len = 1;
65476 +       }
65477 +
65478 +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
65479 +               val = 0;
65480 +       if (val) {
65481 +               be->netif->features |= NETIF_F_SG;
65482 +               be->netif->dev->features |= NETIF_F_SG;
65483 +       }
65484 +
65485 +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
65486 +                        &val) < 0)
65487 +               val = 0;
65488 +       if (val) {
65489 +               be->netif->features |= NETIF_F_TSO;
65490 +               be->netif->dev->features |= NETIF_F_TSO;
65491 +       }
65492 +
65493 +       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
65494 +                        "%d", &val) < 0)
65495 +               val = 0;
65496 +       if (val) {
65497 +               be->netif->features &= ~NETIF_F_IP_CSUM;
65498 +               be->netif->dev->features &= ~NETIF_F_IP_CSUM;
65499 +       }
65500 +
65501 +       /* Map the shared frame, irq etc. */
65502 +       err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
65503 +       if (err) {
65504 +               xenbus_dev_fatal(dev, err,
65505 +                                "mapping shared-frames %lu/%lu port %u",
65506 +                                tx_ring_ref, rx_ring_ref, evtchn);
65507 +               return err;
65508 +       }
65509 +       return 0;
65510 +}
65511 +
65512 +
65513 +/* ** Driver Registration ** */
65514 +
65515 +
65516 +static struct xenbus_device_id netback_ids[] = {
65517 +       { "vif" },
65518 +       { "" }
65519 +};
65520 +
65521 +
65522 +static struct xenbus_driver netback = {
65523 +       .name = "vif",
65524 +       .owner = THIS_MODULE,
65525 +       .ids = netback_ids,
65526 +       .probe = netback_probe,
65527 +       .remove = netback_remove,
65528 +       .uevent = netback_uevent,
65529 +       .otherend_changed = frontend_changed,
65530 +};
65531 +
65532 +
65533 +void netif_xenbus_init(void)
65534 +{
65535 +       xenbus_register_backend(&netback);
65536 +}
65537 diff -ruNp linux-2.6.19/drivers/xen/netfront/Kconfig linux-2.6.19-xen-3.0.4/drivers/xen/netfront/Kconfig
65538 --- linux-2.6.19/drivers/xen/netfront/Kconfig   1970-01-01 00:00:00.000000000 +0000
65539 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netfront/Kconfig 2007-02-02 19:10:45.000000000 +0000
65540 @@ -0,0 +1,6 @@
65541 +
65542 +config XENNET
65543 +       tristate "Xen network driver"
65544 +       depends on NETDEVICES && ARCH_XEN
65545 +       help
65546 +         Network driver for Xen
65547 diff -ruNp linux-2.6.19/drivers/xen/netfront/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/netfront/Makefile
65548 --- linux-2.6.19/drivers/xen/netfront/Makefile  1970-01-01 00:00:00.000000000 +0000
65549 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netfront/Makefile        2007-02-02 19:10:45.000000000 +0000
65550 @@ -0,0 +1,4 @@
65551 +
65552 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      := xennet.o
65553 +
65554 +xennet-objs := netfront.o
65555 diff -ruNp linux-2.6.19/drivers/xen/netfront/netfront.c linux-2.6.19-xen-3.0.4/drivers/xen/netfront/netfront.c
65556 --- linux-2.6.19/drivers/xen/netfront/netfront.c        1970-01-01 00:00:00.000000000 +0000
65557 +++ linux-2.6.19-xen-3.0.4/drivers/xen/netfront/netfront.c      2007-02-02 19:10:45.000000000 +0000
65558 @@ -0,0 +1,2114 @@
65559 +/******************************************************************************
65560 + * Virtual network driver for conversing with remote driver backends.
65561 + *
65562 + * Copyright (c) 2002-2005, K A Fraser
65563 + * Copyright (c) 2005, XenSource Ltd
65564 + *
65565 + * This program is free software; you can redistribute it and/or
65566 + * modify it under the terms of the GNU General Public License version 2
65567 + * as published by the Free Software Foundation; or, when distributed
65568 + * separately from the Linux kernel or incorporated into other
65569 + * software packages, subject to the following license:
65570 + *
65571 + * Permission is hereby granted, free of charge, to any person obtaining a copy
65572 + * of this source file (the "Software"), to deal in the Software without
65573 + * restriction, including without limitation the rights to use, copy, modify,
65574 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
65575 + * and to permit persons to whom the Software is furnished to do so, subject to
65576 + * the following conditions:
65577 + *
65578 + * The above copyright notice and this permission notice shall be included in
65579 + * all copies or substantial portions of the Software.
65580 + *
65581 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65582 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
65583 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
65584 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
65585 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
65586 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
65587 + * IN THE SOFTWARE.
65588 + */
65589 +
65590 +#include <linux/module.h>
65591 +#include <linux/version.h>
65592 +#include <linux/kernel.h>
65593 +#include <linux/sched.h>
65594 +#include <linux/slab.h>
65595 +#include <linux/string.h>
65596 +#include <linux/errno.h>
65597 +#include <linux/netdevice.h>
65598 +#include <linux/inetdevice.h>
65599 +#include <linux/etherdevice.h>
65600 +#include <linux/skbuff.h>
65601 +#include <linux/init.h>
65602 +#include <linux/bitops.h>
65603 +#include <linux/ethtool.h>
65604 +#include <linux/in.h>
65605 +#include <linux/if_ether.h>
65606 +#include <linux/io.h>
65607 +#include <linux/moduleparam.h>
65608 +#include <net/sock.h>
65609 +#include <net/pkt_sched.h>
65610 +#include <net/arp.h>
65611 +#include <net/route.h>
65612 +#include <asm/hypercall.h>
65613 +#include <asm/uaccess.h>
65614 +#include <xen/evtchn.h>
65615 +#include <xen/xenbus.h>
65616 +#include <xen/interface/io/netif.h>
65617 +#include <xen/interface/memory.h>
65618 +#include <xen/balloon.h>
65619 +#include <asm/page.h>
65620 +#include <asm/maddr.h>
65621 +#include <asm/uaccess.h>
65622 +#include <xen/interface/grant_table.h>
65623 +#include <xen/gnttab.h>
65624 +
65625 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
65626 +#include <xen/platform-compat.h>
65627 +#endif
65628 +
65629 +/*
65630 + * Mutually-exclusive module options to select receive data path:
65631 + *  rx_copy : Packets are copied by network backend into local memory
65632 + *  rx_flip : Page containing packet data is transferred to our ownership
65633 + * For fully-virtualised guests there is no option - copying must be used.
65634 + * For paravirtualised guests, flipping is the default.
65635 + */
65636 +#ifdef CONFIG_XEN
65637 +static int MODPARM_rx_copy = 0;
65638 +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
65639 +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
65640 +static int MODPARM_rx_flip = 0;
65641 +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
65642 +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
65643 +#else
65644 +static const int MODPARM_rx_copy = 1;
65645 +static const int MODPARM_rx_flip = 0;
65646 +#endif
65647 +
65648 +#define RX_COPY_THRESHOLD 256
65649 +
65650 +/* If we don't have GSO, fake things up so that we never try to use it. */
65651 +#if defined(NETIF_F_GSO)
65652 +#define HAVE_GSO                       1
65653 +#define HAVE_TSO                       1 /* TSO is a subset of GSO */
65654 +static inline void dev_disable_gso_features(struct net_device *dev)
65655 +{
65656 +       /* Turn off all GSO bits except ROBUST. */
65657 +       dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
65658 +       dev->features |= NETIF_F_GSO_ROBUST;
65659 +}
65660 +#elif defined(NETIF_F_TSO)
65661 +#define HAVE_TSO                       1
65662 +
65663 +/* Some older kernels cannot cope with incorrect checksums,
65664 + * particularly in netfilter. I'm not sure there is 100% correlation
65665 + * with the presence of NETIF_F_TSO but it appears to be a good first
65666 + * approximation.
65667 + */
65668 +#define HAVE_NO_CSUM_OFFLOAD           1
65669 +
65670 +#define gso_size tso_size
65671 +#define gso_segs tso_segs
65672 +static inline void dev_disable_gso_features(struct net_device *dev)
65673 +{
65674 +       /* Turn off all TSO bits. */
65675 +       dev->features &= ~NETIF_F_TSO;
65676 +}
65677 +static inline int skb_is_gso(const struct sk_buff *skb)
65678 +{
65679 +        return skb_shinfo(skb)->tso_size;
65680 +}
65681 +static inline int skb_gso_ok(struct sk_buff *skb, int features)
65682 +{
65683 +        return (features & NETIF_F_TSO);
65684 +}
65685 +
65686 +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
65687 +{
65688 +        return skb_is_gso(skb) &&
65689 +               (!skb_gso_ok(skb, dev->features) ||
65690 +                unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
65691 +}
65692 +#else
65693 +#define netif_needs_gso(dev, skb)      0
65694 +#define dev_disable_gso_features(dev)  ((void)0)
65695 +#endif
65696 +
65697 +#define GRANT_INVALID_REF      0
65698 +
65699 +#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
65700 +#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
65701 +
65702 +struct netfront_info {
65703 +       struct list_head list;
65704 +       struct net_device *netdev;
65705 +
65706 +       struct net_device_stats stats;
65707 +
65708 +       struct netif_tx_front_ring tx;
65709 +       struct netif_rx_front_ring rx;
65710 +
65711 +       spinlock_t   tx_lock;
65712 +       spinlock_t   rx_lock;
65713 +
65714 +       unsigned int evtchn, irq;
65715 +       unsigned int copying_receiver;
65716 +
65717 +       /* Receive-ring batched refills. */
65718 +#define RX_MIN_TARGET 8
65719 +#define RX_DFL_MIN_TARGET 64
65720 +#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
65721 +       unsigned rx_min_target, rx_max_target, rx_target;
65722 +       struct sk_buff_head rx_batch;
65723 +
65724 +       struct timer_list rx_refill_timer;
65725 +
65726 +       /*
65727 +        * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
65728 +        * is an index into a chain of free entries.
65729 +        */
65730 +       struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
65731 +       struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
65732 +
65733 +#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
65734 +       grant_ref_t gref_tx_head;
65735 +       grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
65736 +       grant_ref_t gref_rx_head;
65737 +       grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
65738 +
65739 +       struct xenbus_device *xbdev;
65740 +       int tx_ring_ref;
65741 +       int rx_ring_ref;
65742 +       u8 mac[ETH_ALEN];
65743 +
65744 +       unsigned long rx_pfn_array[NET_RX_RING_SIZE];
65745 +       struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
65746 +       struct mmu_update rx_mmu[NET_RX_RING_SIZE];
65747 +};
65748 +
65749 +struct netfront_rx_info {
65750 +       struct netif_rx_response rx;
65751 +       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
65752 +};
65753 +
65754 +/*
65755 + * Access macros for acquiring and freeing slots in tx_skbs[].
65756 + */
65757 +
65758 +static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
65759 +{
65760 +       list[id] = list[0];
65761 +       list[0]  = (void *)(unsigned long)id;
65762 +}
65763 +
65764 +static inline unsigned short get_id_from_freelist(struct sk_buff **list)
65765 +{
65766 +       unsigned int id = (unsigned int)(unsigned long)list[0];
65767 +       list[0] = list[id];
65768 +       return id;
65769 +}
65770 +
65771 +static inline int xennet_rxidx(RING_IDX idx)
65772 +{
65773 +       return idx & (NET_RX_RING_SIZE - 1);
65774 +}
65775 +
65776 +static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
65777 +                                               RING_IDX ri)
65778 +{
65779 +       int i = xennet_rxidx(ri);
65780 +       struct sk_buff *skb = np->rx_skbs[i];
65781 +       np->rx_skbs[i] = NULL;
65782 +       return skb;
65783 +}
65784 +
65785 +static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
65786 +                                           RING_IDX ri)
65787 +{
65788 +       int i = xennet_rxidx(ri);
65789 +       grant_ref_t ref = np->grant_rx_ref[i];
65790 +       np->grant_rx_ref[i] = GRANT_INVALID_REF;
65791 +       return ref;
65792 +}
65793 +
65794 +#define DPRINTK(fmt, args...)                          \
65795 +       pr_debug("netfront (%s:%d) " fmt,               \
65796 +                __FUNCTION__, __LINE__, ##args)
65797 +#define IPRINTK(fmt, args...)                          \
65798 +       printk(KERN_INFO "netfront: " fmt, ##args)
65799 +#define WPRINTK(fmt, args...)                          \
65800 +       printk(KERN_WARNING "netfront: " fmt, ##args)
65801 +
65802 +static int setup_device(struct xenbus_device *, struct netfront_info *);
65803 +static struct net_device *create_netdev(struct xenbus_device *);
65804 +
65805 +static void end_access(int, void *);
65806 +static void netif_disconnect_backend(struct netfront_info *);
65807 +
65808 +static int network_connect(struct net_device *);
65809 +static void network_tx_buf_gc(struct net_device *);
65810 +static void network_alloc_rx_buffers(struct net_device *);
65811 +static int send_fake_arp(struct net_device *);
65812 +
65813 +static irqreturn_t netif_int(int irq, void *dev_id);
65814 +
65815 +#ifdef CONFIG_SYSFS
65816 +static int xennet_sysfs_addif(struct net_device *netdev);
65817 +static void xennet_sysfs_delif(struct net_device *netdev);
65818 +#else /* !CONFIG_SYSFS */
65819 +#define xennet_sysfs_addif(dev) (0)
65820 +#define xennet_sysfs_delif(dev) do { } while(0)
65821 +#endif
65822 +
65823 +static inline int xennet_can_sg(struct net_device *dev)
65824 +{
65825 +       return dev->features & NETIF_F_SG;
65826 +}
65827 +
65828 +/**
65829 + * Entry point to this code when a new device is created.  Allocate the basic
65830 + * structures and the ring buffers for communication with the backend, and
65831 + * inform the backend of the appropriate details for those.
65832 + */
65833 +static int __devinit netfront_probe(struct xenbus_device *dev,
65834 +                                   const struct xenbus_device_id *id)
65835 +{
65836 +       int err;
65837 +       struct net_device *netdev;
65838 +       struct netfront_info *info;
65839 +
65840 +       netdev = create_netdev(dev);
65841 +       if (IS_ERR(netdev)) {
65842 +               err = PTR_ERR(netdev);
65843 +               xenbus_dev_fatal(dev, err, "creating netdev");
65844 +               return err;
65845 +       }
65846 +
65847 +       info = netdev_priv(netdev);
65848 +       dev->dev.driver_data = info;
65849 +
65850 +       err = register_netdev(info->netdev);
65851 +       if (err) {
65852 +               printk(KERN_WARNING "%s: register_netdev err=%d\n",
65853 +                      __FUNCTION__, err);
65854 +               goto fail;
65855 +       }
65856 +
65857 +       err = xennet_sysfs_addif(info->netdev);
65858 +       if (err) {
65859 +               unregister_netdev(info->netdev);
65860 +               printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
65861 +                      __FUNCTION__, err);
65862 +               goto fail;
65863 +       }
65864 +
65865 +       return 0;
65866 +
65867 + fail:
65868 +       free_netdev(netdev);
65869 +       dev->dev.driver_data = NULL;
65870 +       return err;
65871 +}
65872 +
65873 +static int __devexit netfront_remove(struct xenbus_device *dev)
65874 +{
65875 +       struct netfront_info *info = dev->dev.driver_data;
65876 +
65877 +       DPRINTK("%s\n", dev->nodename);
65878 +
65879 +       netif_disconnect_backend(info);
65880 +
65881 +       del_timer_sync(&info->rx_refill_timer);
65882 +
65883 +       xennet_sysfs_delif(info->netdev);
65884 +
65885 +       unregister_netdev(info->netdev);
65886 +
65887 +       free_netdev(info->netdev);
65888 +
65889 +       return 0;
65890 +}
65891 +
65892 +/**
65893 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
65894 + * driver restart.  We tear down our netif structure and recreate it, but
65895 + * leave the device-layer structures intact so that this is transparent to the
65896 + * rest of the kernel.
65897 + */
65898 +static int netfront_resume(struct xenbus_device *dev)
65899 +{
65900 +       struct netfront_info *info = dev->dev.driver_data;
65901 +
65902 +       DPRINTK("%s\n", dev->nodename);
65903 +
65904 +       netif_disconnect_backend(info);
65905 +       return 0;
65906 +}
65907 +
65908 +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
65909 +{
65910 +       char *s, *e, *macstr;
65911 +       int i;
65912 +
65913 +       macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
65914 +       if (IS_ERR(macstr))
65915 +               return PTR_ERR(macstr);
65916 +
65917 +       for (i = 0; i < ETH_ALEN; i++) {
65918 +               mac[i] = simple_strtoul(s, &e, 16);
65919 +               if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
65920 +                       kfree(macstr);
65921 +                       return -ENOENT;
65922 +               }
65923 +               s = e+1;
65924 +       }
65925 +
65926 +       kfree(macstr);
65927 +       return 0;
65928 +}
65929 +
65930 +/* Common code used when first setting up, and when resuming. */
65931 +static int talk_to_backend(struct xenbus_device *dev,
65932 +                          struct netfront_info *info)
65933 +{
65934 +       const char *message;
65935 +       struct xenbus_transaction xbt;
65936 +       int err;
65937 +
65938 +       err = xen_net_read_mac(dev, info->mac);
65939 +       if (err) {
65940 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
65941 +               goto out;
65942 +       }
65943 +
65944 +       /* Create shared ring, alloc event channel. */
65945 +       err = setup_device(dev, info);
65946 +       if (err)
65947 +               goto out;
65948 +
65949 +again:
65950 +       err = xenbus_transaction_start(&xbt);
65951 +       if (err) {
65952 +               xenbus_dev_fatal(dev, err, "starting transaction");
65953 +               goto destroy_ring;
65954 +       }
65955 +
65956 +       err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
65957 +                           info->tx_ring_ref);
65958 +       if (err) {
65959 +               message = "writing tx ring-ref";
65960 +               goto abort_transaction;
65961 +       }
65962 +       err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
65963 +                           info->rx_ring_ref);
65964 +       if (err) {
65965 +               message = "writing rx ring-ref";
65966 +               goto abort_transaction;
65967 +       }
65968 +       err = xenbus_printf(xbt, dev->nodename,
65969 +                           "event-channel", "%u", info->evtchn);
65970 +       if (err) {
65971 +               message = "writing event-channel";
65972 +               goto abort_transaction;
65973 +       }
65974 +
65975 +       err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
65976 +                           info->copying_receiver);
65977 +       if (err) {
65978 +               message = "writing request-rx-copy";
65979 +               goto abort_transaction;
65980 +       }
65981 +
65982 +       err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
65983 +       if (err) {
65984 +               message = "writing feature-rx-notify";
65985 +               goto abort_transaction;
65986 +       }
65987 +
65988 +#ifdef HAVE_NO_CSUM_OFFLOAD
65989 +       err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1);
65990 +       if (err) {
65991 +               message = "writing feature-no-csum-offload";
65992 +               goto abort_transaction;
65993 +       }
65994 +#endif
65995 +
65996 +       err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
65997 +       if (err) {
65998 +               message = "writing feature-sg";
65999 +               goto abort_transaction;
66000 +       }
66001 +
66002 +#ifdef HAVE_TSO
66003 +       err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
66004 +       if (err) {
66005 +               message = "writing feature-gso-tcpv4";
66006 +               goto abort_transaction;
66007 +       }
66008 +#endif
66009 +
66010 +       err = xenbus_transaction_end(xbt, 0);
66011 +       if (err) {
66012 +               if (err == -EAGAIN)
66013 +                       goto again;
66014 +               xenbus_dev_fatal(dev, err, "completing transaction");
66015 +               goto destroy_ring;
66016 +       }
66017 +
66018 +       return 0;
66019 +
66020 + abort_transaction:
66021 +       xenbus_transaction_end(xbt, 1);
66022 +       xenbus_dev_fatal(dev, err, "%s", message);
66023 + destroy_ring:
66024 +       netif_disconnect_backend(info);
66025 + out:
66026 +       return err;
66027 +}
66028 +
66029 +static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
66030 +{
66031 +       struct netif_tx_sring *txs;
66032 +       struct netif_rx_sring *rxs;
66033 +       int err;
66034 +       struct net_device *netdev = info->netdev;
66035 +
66036 +       info->tx_ring_ref = GRANT_INVALID_REF;
66037 +       info->rx_ring_ref = GRANT_INVALID_REF;
66038 +       info->rx.sring = NULL;
66039 +       info->tx.sring = NULL;
66040 +       info->irq = 0;
66041 +
66042 +       txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
66043 +       if (!txs) {
66044 +               err = -ENOMEM;
66045 +               xenbus_dev_fatal(dev, err, "allocating tx ring page");
66046 +               goto fail;
66047 +       }
66048 +       SHARED_RING_INIT(txs);
66049 +       FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
66050 +
66051 +       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
66052 +       if (err < 0) {
66053 +               free_page((unsigned long)txs);
66054 +               goto fail;
66055 +       }
66056 +       info->tx_ring_ref = err;
66057 +
66058 +       rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
66059 +       if (!rxs) {
66060 +               err = -ENOMEM;
66061 +               xenbus_dev_fatal(dev, err, "allocating rx ring page");
66062 +               goto fail;
66063 +       }
66064 +       SHARED_RING_INIT(rxs);
66065 +       FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
66066 +
66067 +       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
66068 +       if (err < 0) {
66069 +               free_page((unsigned long)rxs);
66070 +               goto fail;
66071 +       }
66072 +       info->rx_ring_ref = err;
66073 +
66074 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
66075 +       if (err)
66076 +               goto fail;
66077 +
66078 +       memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
66079 +       err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
66080 +                                       SA_SAMPLE_RANDOM, netdev->name,
66081 +                                       netdev);
66082 +       if (err < 0)
66083 +               goto fail;
66084 +       info->irq = err;
66085 +       return 0;
66086 +
66087 + fail:
66088 +       return err;
66089 +}
66090 +
66091 +/**
66092 + * Callback received when the backend's state changes.
66093 + */
66094 +static void backend_changed(struct xenbus_device *dev,
66095 +                           enum xenbus_state backend_state)
66096 +{
66097 +       struct netfront_info *np = dev->dev.driver_data;
66098 +       struct net_device *netdev = np->netdev;
66099 +
66100 +       DPRINTK("%s\n", xenbus_strstate(backend_state));
66101 +
66102 +       switch (backend_state) {
66103 +       case XenbusStateInitialising:
66104 +       case XenbusStateInitialised:
66105 +       case XenbusStateConnected:
66106 +       case XenbusStateUnknown:
66107 +       case XenbusStateClosed:
66108 +               break;
66109 +
66110 +       case XenbusStateInitWait:
66111 +               if (dev->state != XenbusStateInitialising)
66112 +                       break;
66113 +               if (network_connect(netdev) != 0)
66114 +                       break;
66115 +               xenbus_switch_state(dev, XenbusStateConnected);
66116 +               (void)send_fake_arp(netdev);
66117 +               break;
66118 +
66119 +       case XenbusStateClosing:
66120 +               xenbus_frontend_closed(dev);
66121 +               break;
66122 +       }
66123 +}
66124 +
66125 +/** Send a packet on a net device to encourage switches to learn the
66126 + * MAC. We send a fake ARP request.
66127 + *
66128 + * @param dev device
66129 + * @return 0 on success, error code otherwise
66130 + */
66131 +static int send_fake_arp(struct net_device *dev)
66132 +{
66133 +       struct sk_buff *skb;
66134 +       u32             src_ip, dst_ip;
66135 +
66136 +       dst_ip = INADDR_BROADCAST;
66137 +       src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
66138 +
66139 +       /* No IP? Then nothing to do. */
66140 +       if (src_ip == 0)
66141 +               return 0;
66142 +
66143 +       skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
66144 +                        dst_ip, dev, src_ip,
66145 +                        /*dst_hw*/ NULL, /*src_hw*/ NULL,
66146 +                        /*target_hw*/ dev->dev_addr);
66147 +       if (skb == NULL)
66148 +               return -ENOMEM;
66149 +
66150 +       return dev_queue_xmit(skb);
66151 +}
66152 +
66153 +static int network_open(struct net_device *dev)
66154 +{
66155 +       struct netfront_info *np = netdev_priv(dev);
66156 +
66157 +       memset(&np->stats, 0, sizeof(np->stats));
66158 +
66159 +       spin_lock(&np->rx_lock);
66160 +       if (netif_carrier_ok(dev)) {
66161 +               network_alloc_rx_buffers(dev);
66162 +               np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
66163 +               if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
66164 +                       netif_rx_schedule(dev);
66165 +       }
66166 +       spin_unlock(&np->rx_lock);
66167 +
66168 +       netif_start_queue(dev);
66169 +
66170 +       return 0;
66171 +}
66172 +
66173 +static inline int netfront_tx_slot_available(struct netfront_info *np)
66174 +{
66175 +       return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 2;
66176 +}
66177 +
66178 +static inline void network_maybe_wake_tx(struct net_device *dev)
66179 +{
66180 +       struct netfront_info *np = netdev_priv(dev);
66181 +
66182 +       if (unlikely(netif_queue_stopped(dev)) &&
66183 +           netfront_tx_slot_available(np) &&
66184 +           likely(netif_running(dev)))
66185 +               netif_wake_queue(dev);
66186 +}
66187 +
66188 +static void network_tx_buf_gc(struct net_device *dev)
66189 +{
66190 +       RING_IDX cons, prod;
66191 +       unsigned short id;
66192 +       struct netfront_info *np = netdev_priv(dev);
66193 +       struct sk_buff *skb;
66194 +
66195 +       BUG_ON(!netif_carrier_ok(dev));
66196 +
66197 +       do {
66198 +               prod = np->tx.sring->rsp_prod;
66199 +               rmb(); /* Ensure we see responses up to 'rp'. */
66200 +
66201 +               for (cons = np->tx.rsp_cons; cons != prod; cons++) {
66202 +                       struct netif_tx_response *txrsp;
66203 +
66204 +                       txrsp = RING_GET_RESPONSE(&np->tx, cons);
66205 +                       if (txrsp->status == NETIF_RSP_NULL)
66206 +                               continue;
66207 +
66208 +                       id  = txrsp->id;
66209 +                       skb = np->tx_skbs[id];
66210 +                       if (unlikely(gnttab_query_foreign_access(
66211 +                               np->grant_tx_ref[id]) != 0)) {
66212 +                               printk(KERN_ALERT "network_tx_buf_gc: warning "
66213 +                                      "-- grant still in use by backend "
66214 +                                      "domain.\n");
66215 +                               BUG();
66216 +                       }
66217 +                       gnttab_end_foreign_access_ref(
66218 +                               np->grant_tx_ref[id], GNTMAP_readonly);
66219 +                       gnttab_release_grant_reference(
66220 +                               &np->gref_tx_head, np->grant_tx_ref[id]);
66221 +                       np->grant_tx_ref[id] = GRANT_INVALID_REF;
66222 +                       add_id_to_freelist(np->tx_skbs, id);
66223 +                       dev_kfree_skb_irq(skb);
66224 +               }
66225 +
66226 +               np->tx.rsp_cons = prod;
66227 +
66228 +               /*
66229 +                * Set a new event, then check for race with update of tx_cons.
66230 +                * Note that it is essential to schedule a callback, no matter
66231 +                * how few buffers are pending. Even if there is space in the
66232 +                * transmit ring, higher layers may be blocked because too much
66233 +                * data is outstanding: in such cases notification from Xen is
66234 +                * likely to be the only kick that we'll get.
66235 +                */
66236 +               np->tx.sring->rsp_event =
66237 +                       prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
66238 +               mb();
66239 +       } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
66240 +
66241 +       network_maybe_wake_tx(dev);
66242 +}
66243 +
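The rsp_event computation above is the notification-suppression side of the ring protocol: rather than taking an interrupt for every completed transmit, the frontend asks to be notified again only once responses have been produced for roughly half of the requests still outstanding. A toy calculation with plain integers standing in for the shared-ring fields (the values are made up for illustration):

        #include <stdio.h>

        int main(void)
        {
                unsigned int prod     = 100;    /* responses consumed so far */
                unsigned int req_prod = 140;    /* requests published so far */
                unsigned int rsp_event;

                /* Same formula as np->tx.sring->rsp_event above. */
                rsp_event = prod + ((req_prod - prod) >> 1) + 1;

                printf("outstanding=%u, next notification at response %u\n",
                       req_prod - prod, rsp_event);     /* outstanding=40, 121 */
                return 0;
        }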
66244 +static void rx_refill_timeout(unsigned long data)
66245 +{
66246 +       struct net_device *dev = (struct net_device *)data;
66247 +       netif_rx_schedule(dev);
66248 +}
66249 +
66250 +static void network_alloc_rx_buffers(struct net_device *dev)
66251 +{
66252 +       unsigned short id;
66253 +       struct netfront_info *np = netdev_priv(dev);
66254 +       struct sk_buff *skb;
66255 +       struct page *page;
66256 +       int i, batch_target, notify;
66257 +       RING_IDX req_prod = np->rx.req_prod_pvt;
66258 +       struct xen_memory_reservation reservation;
66259 +       grant_ref_t ref;
66260 +       unsigned long pfn;
66261 +       void *vaddr;
66262 +       int nr_flips;
66263 +       netif_rx_request_t *req;
66264 +
66265 +       if (unlikely(!netif_carrier_ok(dev)))
66266 +               return;
66267 +
66268 +       /*
66269 +        * Allocate skbuffs greedily, even though we batch updates to the
66270 +        * receive ring. This creates a less bursty demand on the memory
66271 +        * allocator, so should reduce the chance of failed allocation requests
66272 +        * both for ourself and for other kernel subsystems.
66273 +        */
66274 +       batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
66275 +       for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
66276 +               /*
66277 +                * Allocate an skb and a page. Do not use __dev_alloc_skb as
66278 +                * that will allocate page-sized buffers which is not
66279 +                * necessary here.
66280 +                * 16 bytes added as necessary headroom for netif_receive_skb.
66281 +                */
66282 +               skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
66283 +                               GFP_ATOMIC | __GFP_NOWARN);
66284 +               if (unlikely(!skb))
66285 +                       goto no_skb;
66286 +
66287 +               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
66288 +               if (!page) {
66289 +                       kfree_skb(skb);
66290 +no_skb:
66291 +                       /* Any skbuffs queued for refill? Force them out. */
66292 +                       if (i != 0)
66293 +                               goto refill;
66294 +                       /* Could not allocate any skbuffs. Try again later. */
66295 +                       mod_timer(&np->rx_refill_timer,
66296 +                                 jiffies + (HZ/10));
66297 +                       break;
66298 +               }
66299 +
66300 +               skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
66301 +               skb_shinfo(skb)->frags[0].page = page;
66302 +               skb_shinfo(skb)->nr_frags = 1;
66303 +               __skb_queue_tail(&np->rx_batch, skb);
66304 +       }
66305 +
66306 +       /* Is the batch large enough to be worthwhile? */
66307 +       if (i < (np->rx_target/2)) {
66308 +               if (req_prod > np->rx.sring->req_prod)
66309 +                       goto push;
66310 +               return;
66311 +       }
66312 +
66313 +       /* Adjust our fill target if we risked running out of buffers. */
66314 +       if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
66315 +           ((np->rx_target *= 2) > np->rx_max_target))
66316 +               np->rx_target = np->rx_max_target;
66317 +
66318 + refill:
66319 +       for (nr_flips = i = 0; ; i++) {
66320 +               if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
66321 +                       break;
66322 +
66323 +               skb->dev = dev;
66324 +
66325 +               id = xennet_rxidx(req_prod + i);
66326 +
66327 +               BUG_ON(np->rx_skbs[id]);
66328 +               np->rx_skbs[id] = skb;
66329 +
66330 +               ref = gnttab_claim_grant_reference(&np->gref_rx_head);
66331 +               BUG_ON((signed short)ref < 0);
66332 +               np->grant_rx_ref[id] = ref;
66333 +
66334 +               pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
66335 +               vaddr = page_address(skb_shinfo(skb)->frags[0].page);
66336 +
66337 +               req = RING_GET_REQUEST(&np->rx, req_prod + i);
66338 +               if (!np->copying_receiver) {
66339 +                       gnttab_grant_foreign_transfer_ref(ref,
66340 +                                                         np->xbdev->otherend_id,
66341 +                                                         pfn);
66342 +                       np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
66343 +                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
66344 +                               /* Remove this page before passing
66345 +                                * back to Xen. */
66346 +                               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
66347 +                               MULTI_update_va_mapping(np->rx_mcl+i,
66348 +                                                       (unsigned long)vaddr,
66349 +                                                       __pte(0), 0);
66350 +                       }
66351 +                       nr_flips++;
66352 +               } else {
66353 +                       gnttab_grant_foreign_access_ref(ref,
66354 +                                                       np->xbdev->otherend_id,
66355 +                                                       pfn_to_mfn(pfn),
66356 +                                                       0);
66357 +               }
66358 +
66359 +               req->id = id;
66360 +               req->gref = ref;
66361 +       }
66362 +
66363 +       if ( nr_flips != 0 ) {
66364 +               /* Tell the balloon driver what is going on. */
66365 +               balloon_update_driver_allowance(i);
66366 +
66367 +               set_xen_guest_handle(reservation.extent_start,
66368 +                                    np->rx_pfn_array);
66369 +               reservation.nr_extents   = nr_flips;
66370 +               reservation.extent_order = 0;
66371 +               reservation.address_bits = 0;
66372 +               reservation.domid        = DOMID_SELF;
66373 +
66374 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
66375 +                       /* After all PTEs have been zapped, flush the TLB. */
66376 +                       np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
66377 +                               UVMF_TLB_FLUSH|UVMF_ALL;
66378 +
66379 +                       /* Give away a batch of pages. */
66380 +                       np->rx_mcl[i].op = __HYPERVISOR_memory_op;
66381 +                       np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
66382 +                       np->rx_mcl[i].args[1] = (unsigned long)&reservation;
66383 +
66384 +                       /* Zap PTEs and give away pages in one big
66385 +                        * multicall. */
66386 +                       (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
66387 +
66388 +                       /* Check return status of HYPERVISOR_memory_op(). */
66389 +                       if (unlikely(np->rx_mcl[i].result != i))
66390 +                               panic("Unable to reduce memory reservation\n");
66391 +               } else {
66392 +                       if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
66393 +                                                &reservation) != i)
66394 +                               panic("Unable to reduce memory reservation\n");
66395 +               }
66396 +       } else {
66397 +               wmb();
66398 +       }
66399 +
66400 +       /* Above is a suitable barrier to ensure backend will see requests. */
66401 +       np->rx.req_prod_pvt = req_prod + i;
66402 + push:
66403 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
66404 +       if (notify)
66405 +               notify_remote_via_irq(np->irq);
66406 +}
66407 +
66408 +static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
66409 +                             struct netif_tx_request *tx)
66410 +{
66411 +       struct netfront_info *np = netdev_priv(dev);
66412 +       char *data = skb->data;
66413 +       unsigned long mfn;
66414 +       RING_IDX prod = np->tx.req_prod_pvt;
66415 +       int frags = skb_shinfo(skb)->nr_frags;
66416 +       unsigned int offset = offset_in_page(data);
66417 +       unsigned int len = skb_headlen(skb);
66418 +       unsigned int id;
66419 +       grant_ref_t ref;
66420 +       int i;
66421 +
66422 +       while (len > PAGE_SIZE - offset) {
66423 +               tx->size = PAGE_SIZE - offset;
66424 +               tx->flags |= NETTXF_more_data;
66425 +               len -= tx->size;
66426 +               data += tx->size;
66427 +               offset = 0;
66428 +
66429 +               id = get_id_from_freelist(np->tx_skbs);
66430 +               np->tx_skbs[id] = skb_get(skb);
66431 +               tx = RING_GET_REQUEST(&np->tx, prod++);
66432 +               tx->id = id;
66433 +               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
66434 +               BUG_ON((signed short)ref < 0);
66435 +
66436 +               mfn = virt_to_mfn(data);
66437 +               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
66438 +                                               mfn, GNTMAP_readonly);
66439 +
66440 +               tx->gref = np->grant_tx_ref[id] = ref;
66441 +               tx->offset = offset;
66442 +               tx->size = len;
66443 +               tx->flags = 0;
66444 +       }
66445 +
66446 +       for (i = 0; i < frags; i++) {
66447 +               skb_frag_t *frag = skb_shinfo(skb)->frags + i;
66448 +
66449 +               tx->flags |= NETTXF_more_data;
66450 +
66451 +               id = get_id_from_freelist(np->tx_skbs);
66452 +               np->tx_skbs[id] = skb_get(skb);
66453 +               tx = RING_GET_REQUEST(&np->tx, prod++);
66454 +               tx->id = id;
66455 +               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
66456 +               BUG_ON((signed short)ref < 0);
66457 +
66458 +               mfn = pfn_to_mfn(page_to_pfn(frag->page));
66459 +               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
66460 +                                               mfn, GNTMAP_readonly);
66461 +
66462 +               tx->gref = np->grant_tx_ref[id] = ref;
66463 +               tx->offset = frag->page_offset;
66464 +               tx->size = frag->size;
66465 +               tx->flags = 0;
66466 +       }
66467 +
66468 +       np->tx.req_prod_pvt = prod;
66469 +}
66470 +
66471 +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
66472 +{
66473 +       unsigned short id;
66474 +       struct netfront_info *np = netdev_priv(dev);
66475 +       struct netif_tx_request *tx;
66476 +       struct netif_extra_info *extra;
66477 +       char *data = skb->data;
66478 +       RING_IDX i;
66479 +       grant_ref_t ref;
66480 +       unsigned long mfn;
66481 +       int notify;
66482 +       int frags = skb_shinfo(skb)->nr_frags;
66483 +       unsigned int offset = offset_in_page(data);
66484 +       unsigned int len = skb_headlen(skb);
66485 +
66486 +       frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
66487 +       if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
66488 +               printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
66489 +                      frags);
66490 +               dump_stack();
66491 +               goto drop;
66492 +       }
66493 +
66494 +       spin_lock_irq(&np->tx_lock);
66495 +
66496 +       if (unlikely(!netif_carrier_ok(dev) ||
66497 +                    (frags > 1 && !xennet_can_sg(dev)) ||
66498 +                    netif_needs_gso(dev, skb))) {
66499 +               spin_unlock_irq(&np->tx_lock);
66500 +               goto drop;
66501 +       }
66502 +
66503 +       i = np->tx.req_prod_pvt;
66504 +
66505 +       id = get_id_from_freelist(np->tx_skbs);
66506 +       np->tx_skbs[id] = skb;
66507 +
66508 +       tx = RING_GET_REQUEST(&np->tx, i);
66509 +
66510 +       tx->id   = id;
66511 +       ref = gnttab_claim_grant_reference(&np->gref_tx_head);
66512 +       BUG_ON((signed short)ref < 0);
66513 +       mfn = virt_to_mfn(data);
66514 +       gnttab_grant_foreign_access_ref(
66515 +               ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
66516 +       tx->gref = np->grant_tx_ref[id] = ref;
66517 +       tx->offset = offset;
66518 +       tx->size = len;
66519 +
66520 +       tx->flags = 0;
66521 +       extra = NULL;
66522 +
66523 +       if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
66524 +               tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
66525 +#ifdef CONFIG_XEN
66526 +       if (skb->proto_data_valid) /* remote but checksummed? */
66527 +               tx->flags |= NETTXF_data_validated;
66528 +#endif
66529 +
66530 +#ifdef HAVE_TSO
66531 +       if (skb_is_gso(skb)) {
66532 +               struct netif_extra_info *gso = (struct netif_extra_info *)
66533 +                       RING_GET_REQUEST(&np->tx, ++i);
66534 +
66535 +               if (extra)
66536 +                       extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
66537 +               else
66538 +                       tx->flags |= NETTXF_extra_info;
66539 +
66540 +               gso->u.gso.size = skb_shinfo(skb)->gso_size;
66541 +               gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
66542 +               gso->u.gso.pad = 0;
66543 +               gso->u.gso.features = 0;
66544 +
66545 +               gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
66546 +               gso->flags = 0;
66547 +               extra = gso;
66548 +       }
66549 +#endif
66550 +
66551 +       np->tx.req_prod_pvt = i + 1;
66552 +
66553 +       xennet_make_frags(skb, dev, tx);
66554 +       tx->size = skb->len;
66555 +
66556 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
66557 +       if (notify)
66558 +               notify_remote_via_irq(np->irq);
66559 +
66560 +       network_tx_buf_gc(dev);
66561 +
66562 +       if (!netfront_tx_slot_available(np))
66563 +               netif_stop_queue(dev);
66564 +
66565 +       spin_unlock_irq(&np->tx_lock);
66566 +
66567 +       np->stats.tx_bytes += skb->len;
66568 +       np->stats.tx_packets++;
66569 +
66570 +       return 0;
66571 +
66572 + drop:
66573 +       np->stats.tx_dropped++;
66574 +       dev_kfree_skb(skb);
66575 +       return 0;
66576 +}
66577 +
66578 +static irqreturn_t netif_int(int irq, void *dev_id)
66579 +{
66580 +       struct net_device *dev = dev_id;
66581 +       struct netfront_info *np = netdev_priv(dev);
66582 +       unsigned long flags;
66583 +
66584 +       spin_lock_irqsave(&np->tx_lock, flags);
66585 +
66586 +       if (likely(netif_carrier_ok(dev))) {
66587 +               network_tx_buf_gc(dev);
66588 +               /* Under tx_lock: protects access to rx shared-ring indexes. */
66589 +               if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
66590 +                       netif_rx_schedule(dev);
66591 +       }
66592 +
66593 +       spin_unlock_irqrestore(&np->tx_lock, flags);
66594 +
66595 +       return IRQ_HANDLED;
66596 +}
66597 +
66598 +static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
66599 +                               grant_ref_t ref)
66600 +{
66601 +       int new = xennet_rxidx(np->rx.req_prod_pvt);
66602 +
66603 +       BUG_ON(np->rx_skbs[new]);
66604 +       np->rx_skbs[new] = skb;
66605 +       np->grant_rx_ref[new] = ref;
66606 +       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
66607 +       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
66608 +       np->rx.req_prod_pvt++;
66609 +}
66610 +
66611 +int xennet_get_extras(struct netfront_info *np,
66612 +                     struct netif_extra_info *extras, RING_IDX rp)
66613 +
66614 +{
66615 +       struct netif_extra_info *extra;
66616 +       RING_IDX cons = np->rx.rsp_cons;
66617 +       int err = 0;
66618 +
66619 +       do {
66620 +               struct sk_buff *skb;
66621 +               grant_ref_t ref;
66622 +
66623 +               if (unlikely(cons + 1 == rp)) {
66624 +                       if (net_ratelimit())
66625 +                               WPRINTK("Missing extra info\n");
66626 +                       err = -EBADR;
66627 +                       break;
66628 +               }
66629 +
66630 +               extra = (struct netif_extra_info *)
66631 +                       RING_GET_RESPONSE(&np->rx, ++cons);
66632 +
66633 +               if (unlikely(!extra->type ||
66634 +                            extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
66635 +                       if (net_ratelimit())
66636 +                               WPRINTK("Invalid extra type: %d\n",
66637 +                                       extra->type);
66638 +                       err = -EINVAL;
66639 +               } else {
66640 +                       memcpy(&extras[extra->type - 1], extra,
66641 +                              sizeof(*extra));
66642 +               }
66643 +
66644 +               skb = xennet_get_rx_skb(np, cons);
66645 +               ref = xennet_get_rx_ref(np, cons);
66646 +               xennet_move_rx_slot(np, skb, ref);
66647 +       } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
66648 +
66649 +       np->rx.rsp_cons = cons;
66650 +       return err;
66651 +}
66652 +
66653 +static int xennet_get_responses(struct netfront_info *np,
66654 +                               struct netfront_rx_info *rinfo, RING_IDX rp,
66655 +                               struct sk_buff_head *list,
66656 +                               int *pages_flipped_p)
66657 +{
66658 +       int pages_flipped = *pages_flipped_p;
66659 +       struct mmu_update *mmu;
66660 +       struct multicall_entry *mcl;
66661 +       struct netif_rx_response *rx = &rinfo->rx;
66662 +       struct netif_extra_info *extras = rinfo->extras;
66663 +       RING_IDX cons = np->rx.rsp_cons;
66664 +       struct sk_buff *skb = xennet_get_rx_skb(np, cons);
66665 +       grant_ref_t ref = xennet_get_rx_ref(np, cons);
66666 +       int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
66667 +       int frags = 1;
66668 +       int err = 0;
66669 +       unsigned long ret;
66670 +
66671 +       if (rx->flags & NETRXF_extra_info) {
66672 +               err = xennet_get_extras(np, extras, rp);
66673 +               cons = np->rx.rsp_cons;
66674 +       }
66675 +
66676 +       for (;;) {
66677 +               unsigned long mfn;
66678 +
66679 +               if (unlikely(rx->status < 0 ||
66680 +                            rx->offset + rx->status > PAGE_SIZE)) {
66681 +                       if (net_ratelimit())
66682 +                               WPRINTK("rx->offset: %x, size: %u\n",
66683 +                                       rx->offset, rx->status);
66684 +                       xennet_move_rx_slot(np, skb, ref);
66685 +                       err = -EINVAL;
66686 +                       goto next;
66687 +               }
66688 +
66689 +               /*
66690 +                * This definitely indicates a bug, either in this driver or in
66691 +                * the backend driver. In future this should flag the bad
66692 +                * situation to the system controller to reboot the backend.
66693 +                */
66694 +               if (ref == GRANT_INVALID_REF) {
66695 +                       if (net_ratelimit())
66696 +                               WPRINTK("Bad rx response id %d.\n", rx->id);
66697 +                       err = -EINVAL;
66698 +                       goto next;
66699 +               }
66700 +
66701 +               if (!np->copying_receiver) {
66702 +                       /* Memory pressure, insufficient buffer
66703 +                        * headroom, ... */
66704 +                       if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
66705 +                               if (net_ratelimit())
66706 +                                       WPRINTK("Unfulfilled rx req "
66707 +                                               "(id=%d, st=%d).\n",
66708 +                                               rx->id, rx->status);
66709 +                               xennet_move_rx_slot(np, skb, ref);
66710 +                               err = -ENOMEM;
66711 +                               goto next;
66712 +                       }
66713 +
66714 +                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
66715 +                               /* Remap the page. */
66716 +                               struct page *page =
66717 +                                       skb_shinfo(skb)->frags[0].page;
66718 +                               unsigned long pfn = page_to_pfn(page);
66719 +                               void *vaddr = page_address(page);
66720 +
66721 +                               mcl = np->rx_mcl + pages_flipped;
66722 +                               mmu = np->rx_mmu + pages_flipped;
66723 +
66724 +                               MULTI_update_va_mapping(mcl,
66725 +                                                       (unsigned long)vaddr,
66726 +                                                       pfn_pte_ma(mfn,
66727 +                                                                  PAGE_KERNEL),
66728 +                                                       0);
66729 +                               mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
66730 +                                       | MMU_MACHPHYS_UPDATE;
66731 +                               mmu->val = pfn;
66732 +
66733 +                               set_phys_to_machine(pfn, mfn);
66734 +                       }
66735 +                       pages_flipped++;
66736 +               } else {
66737 +                       ret = gnttab_end_foreign_access_ref(ref, 0);
66738 +                       BUG_ON(!ret);
66739 +               }
66740 +
66741 +               gnttab_release_grant_reference(&np->gref_rx_head, ref);
66742 +
66743 +               __skb_queue_tail(list, skb);
66744 +
66745 +next:
66746 +               if (!(rx->flags & NETRXF_more_data))
66747 +                       break;
66748 +
66749 +               if (cons + frags == rp) {
66750 +                       if (net_ratelimit())
66751 +                               WPRINTK("Need more frags\n");
66752 +                       err = -ENOENT;
66753 +                       break;
66754 +               }
66755 +
66756 +               rx = RING_GET_RESPONSE(&np->rx, cons + frags);
66757 +               skb = xennet_get_rx_skb(np, cons + frags);
66758 +               ref = xennet_get_rx_ref(np, cons + frags);
66759 +               frags++;
66760 +       }
66761 +
66762 +       if (unlikely(frags > max)) {
66763 +               if (net_ratelimit())
66764 +                       WPRINTK("Too many frags\n");
66765 +               err = -E2BIG;
66766 +       }
66767 +
66768 +       if (unlikely(err))
66769 +               np->rx.rsp_cons = cons + frags;
66770 +
66771 +       *pages_flipped_p = pages_flipped;
66772 +
66773 +       return err;
66774 +}
66775 +
66776 +static RING_IDX xennet_fill_frags(struct netfront_info *np,
66777 +                                 struct sk_buff *skb,
66778 +                                 struct sk_buff_head *list)
66779 +{
66780 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
66781 +       int nr_frags = shinfo->nr_frags;
66782 +       RING_IDX cons = np->rx.rsp_cons;
66783 +       skb_frag_t *frag = shinfo->frags + nr_frags;
66784 +       struct sk_buff *nskb;
66785 +
66786 +       while ((nskb = __skb_dequeue(list))) {
66787 +               struct netif_rx_response *rx =
66788 +                       RING_GET_RESPONSE(&np->rx, ++cons);
66789 +
66790 +               frag->page = skb_shinfo(nskb)->frags[0].page;
66791 +               frag->page_offset = rx->offset;
66792 +               frag->size = rx->status;
66793 +
66794 +               skb->data_len += rx->status;
66795 +
66796 +               skb_shinfo(nskb)->nr_frags = 0;
66797 +               kfree_skb(nskb);
66798 +
66799 +               frag++;
66800 +               nr_frags++;
66801 +       }
66802 +
66803 +       shinfo->nr_frags = nr_frags;
66804 +       return cons;
66805 +}
66806 +
66807 +static int xennet_set_skb_gso(struct sk_buff *skb,
66808 +                             struct netif_extra_info *gso)
66809 +{
66810 +       if (!gso->u.gso.size) {
66811 +               if (net_ratelimit())
66812 +                       WPRINTK("GSO size must not be zero.\n");
66813 +               return -EINVAL;
66814 +       }
66815 +
66816 +       /* Currently only TCPv4 S.O. is supported. */
66817 +       if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
66818 +               if (net_ratelimit())
66819 +                       WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
66820 +               return -EINVAL;
66821 +       }
66822 +
66823 +#ifdef HAVE_TSO
66824 +       skb_shinfo(skb)->gso_size = gso->u.gso.size;
66825 +#ifdef HAVE_GSO
66826 +       skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
66827 +
66828 +       /* Header must be checked, and gso_segs computed. */
66829 +       skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
66830 +#endif
66831 +       skb_shinfo(skb)->gso_segs = 0;
66832 +
66833 +       return 0;
66834 +#else
66835 +       if (net_ratelimit())
66836 +               WPRINTK("GSO unsupported by this kernel.\n");
66837 +       return -EINVAL;
66838 +#endif
66839 +}
66840 +
66841 +static int netif_poll(struct net_device *dev, int *pbudget)
66842 +{
66843 +       struct netfront_info *np = netdev_priv(dev);
66844 +       struct sk_buff *skb;
66845 +       struct netfront_rx_info rinfo;
66846 +       struct netif_rx_response *rx = &rinfo.rx;
66847 +       struct netif_extra_info *extras = rinfo.extras;
66848 +       RING_IDX i, rp;
66849 +       struct multicall_entry *mcl;
66850 +       int work_done, budget, more_to_do = 1;
66851 +       struct sk_buff_head rxq;
66852 +       struct sk_buff_head errq;
66853 +       struct sk_buff_head tmpq;
66854 +       unsigned long flags;
66855 +       unsigned int len;
66856 +       int pages_flipped = 0;
66857 +       int err;
66858 +
66859 +       spin_lock(&np->rx_lock);
66860 +
66861 +       if (unlikely(!netif_carrier_ok(dev))) {
66862 +               spin_unlock(&np->rx_lock);
66863 +               return 0;
66864 +       }
66865 +
66866 +       skb_queue_head_init(&rxq);
66867 +       skb_queue_head_init(&errq);
66868 +       skb_queue_head_init(&tmpq);
66869 +
66870 +       if ((budget = *pbudget) > dev->quota)
66871 +               budget = dev->quota;
66872 +       rp = np->rx.sring->rsp_prod;
66873 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
66874 +
66875 +       i = np->rx.rsp_cons;
66876 +       work_done = 0;
66877 +       while ((i != rp) && (work_done < budget)) {
66878 +               memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
66879 +               memset(extras, 0, sizeof(extras));
66880 +
66881 +               err = xennet_get_responses(np, &rinfo, rp, &tmpq,
66882 +                                          &pages_flipped);
66883 +
66884 +               if (unlikely(err)) {
66885 +err:   
66886 +                       while ((skb = __skb_dequeue(&tmpq)))
66887 +                               __skb_queue_tail(&errq, skb);
66888 +                       np->stats.rx_errors++;
66889 +                       i = np->rx.rsp_cons;
66890 +                       continue;
66891 +               }
66892 +
66893 +               skb = __skb_dequeue(&tmpq);
66894 +
66895 +               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
66896 +                       struct netif_extra_info *gso;
66897 +                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
66898 +
66899 +                       if (unlikely(xennet_set_skb_gso(skb, gso))) {
66900 +                               __skb_queue_head(&tmpq, skb);
66901 +                               np->rx.rsp_cons += skb_queue_len(&tmpq);
66902 +                               goto err;
66903 +                       }
66904 +               }
66905 +
66906 +               skb->nh.raw = (void *)skb_shinfo(skb)->frags[0].page;
66907 +               skb->h.raw = skb->nh.raw + rx->offset;
66908 +
66909 +               len = rx->status;
66910 +               if (len > RX_COPY_THRESHOLD)
66911 +                       len = RX_COPY_THRESHOLD;
66912 +               skb_put(skb, len);
66913 +
66914 +               if (rx->status > len) {
66915 +                       skb_shinfo(skb)->frags[0].page_offset =
66916 +                               rx->offset + len;
66917 +                       skb_shinfo(skb)->frags[0].size = rx->status - len;
66918 +                       skb->data_len = rx->status - len;
66919 +               } else {
66920 +                       skb_shinfo(skb)->frags[0].page = NULL;
66921 +                       skb_shinfo(skb)->nr_frags = 0;
66922 +               }
66923 +
66924 +               i = xennet_fill_frags(np, skb, &tmpq);
66925 +
66926 +               /*
66927 +                * Truesize must approximate the size of true data plus
66928 +                * any supervisor overheads. Adding hypervisor overheads
66929 +                * has been shown to significantly reduce achievable
66930 +                * bandwidth with the default receive buffer size. It is
66931 +                * therefore not wise to account for it here.
66932 +                *
66933 +                * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
66934 +                * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
66935 +                * add the size of the data pulled in xennet_fill_frags().
66936 +                *
66937 +                * We also adjust for any unused space in the main data
66938 +                * area by subtracting (RX_COPY_THRESHOLD - len). This is
66939 +                * especially important with drivers which split incoming
66940 +                * packets into header and data, using only 64 bytes of
66941 +                * the main data area (see the e1000 driver for example.)
66942 +                * On such systems, without this last adjustment, our
66943 +                * achievable receive throughput using the standard receive
66944 +                * buffer size was cut by 25%(!!!).
66945 +                */
66946 +               skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
66947 +               skb->len += skb->data_len;
66948 +
66949 +               /*
66950 +                * Old backends do not assert data_validated but we
66951 +                * can infer it from csum_blank so test both flags.
66952 +                */
66953 +               if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
66954 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
66955 +               else
66956 +                       skb->ip_summed = CHECKSUM_NONE;
66957 +#ifdef CONFIG_XEN
66958 +               skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
66959 +               skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
66960 +#endif
66961 +               np->stats.rx_packets++;
66962 +               np->stats.rx_bytes += skb->len;
66963 +
66964 +               __skb_queue_tail(&rxq, skb);
66965 +
66966 +               np->rx.rsp_cons = ++i;
66967 +               work_done++;
66968 +       }
66969 +
66970 +       if (pages_flipped) {
66971 +               /* Some pages are no longer absent... */
66972 +               balloon_update_driver_allowance(-pages_flipped);
66973 +
66974 +               /* Do all the remapping work and M2P updates. */
66975 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
66976 +                       mcl = np->rx_mcl + pages_flipped;
66977 +                       mcl->op = __HYPERVISOR_mmu_update;
66978 +                       mcl->args[0] = (unsigned long)np->rx_mmu;
66979 +                       mcl->args[1] = pages_flipped;
66980 +                       mcl->args[2] = 0;
66981 +                       mcl->args[3] = DOMID_SELF;
66982 +                       (void)HYPERVISOR_multicall(np->rx_mcl,
66983 +                                                  pages_flipped + 1);
66984 +               }
66985 +       }
66986 +
66987 +       while ((skb = __skb_dequeue(&errq)))
66988 +               kfree_skb(skb);
66989 +
66990 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
66991 +               struct page *page = (struct page *)skb->nh.raw;
66992 +               void *vaddr = page_address(page);
66993 +
66994 +               memcpy(skb->data, vaddr + (skb->h.raw - skb->nh.raw),
66995 +                      skb_headlen(skb));
66996 +
66997 +               if (page != skb_shinfo(skb)->frags[0].page)
66998 +                       __free_page(page);
66999 +
67000 +               /* Ethernet work: Delayed to here as it peeks the header. */
67001 +               skb->protocol = eth_type_trans(skb, dev);
67002 +
67003 +               /* Pass it up. */
67004 +               netif_receive_skb(skb);
67005 +               dev->last_rx = jiffies;
67006 +       }
67007 +
67008 +       /* If we get a callback with very few responses, reduce fill target. */
67009 +       /* NB. Note exponential increase, linear decrease. */
67010 +       if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
67011 +            ((3*np->rx_target) / 4)) &&
67012 +           (--np->rx_target < np->rx_min_target))
67013 +               np->rx_target = np->rx_min_target;
67014 +
67015 +       network_alloc_rx_buffers(dev);
67016 +
67017 +       *pbudget   -= work_done;
67018 +       dev->quota -= work_done;
67019 +
67020 +       if (work_done < budget) {
67021 +               local_irq_save(flags);
67022 +
67023 +               RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
67024 +               if (!more_to_do)
67025 +                       __netif_rx_complete(dev);
67026 +
67027 +               local_irq_restore(flags);
67028 +       }
67029 +
67030 +       spin_unlock(&np->rx_lock);
67031 +
67032 +       return more_to_do;
67033 +}
67034 +
67035 +static void netif_release_tx_bufs(struct netfront_info *np)
67036 +{
67037 +       struct sk_buff *skb;
67038 +       int i;
67039 +
67040 +       for (i = 1; i <= NET_TX_RING_SIZE; i++) {
67041 +               if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
67042 +                       continue;
67043 +
67044 +               skb = np->tx_skbs[i];
67045 +               gnttab_end_foreign_access_ref(
67046 +                       np->grant_tx_ref[i], GNTMAP_readonly);
67047 +               gnttab_release_grant_reference(
67048 +                       &np->gref_tx_head, np->grant_tx_ref[i]);
67049 +               np->grant_tx_ref[i] = GRANT_INVALID_REF;
67050 +               add_id_to_freelist(np->tx_skbs, i);
67051 +               dev_kfree_skb_irq(skb);
67052 +       }
67053 +}
67054 +
67055 +static void netif_release_rx_bufs(struct netfront_info *np)
67056 +{
67057 +       struct mmu_update      *mmu = np->rx_mmu;
67058 +       struct multicall_entry *mcl = np->rx_mcl;
67059 +       struct sk_buff_head free_list;
67060 +       struct sk_buff *skb;
67061 +       unsigned long mfn;
67062 +       int xfer = 0, noxfer = 0, unused = 0;
67063 +       int id, ref;
67064 +
67065 +       if (np->copying_receiver) {
67066 +               printk("%s: fix me for copying receiver.\n", __FUNCTION__);
67067 +               return;
67068 +       }
67069 +
67070 +       skb_queue_head_init(&free_list);
67071 +
67072 +       spin_lock(&np->rx_lock);
67073 +
67074 +       for (id = 0; id < NET_RX_RING_SIZE; id++) {
67075 +               if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
67076 +                       unused++;
67077 +                       continue;
67078 +               }
67079 +
67080 +               skb = np->rx_skbs[id];
67081 +               mfn = gnttab_end_foreign_transfer_ref(ref);
67082 +               gnttab_release_grant_reference(&np->gref_rx_head, ref);
67083 +               np->grant_rx_ref[id] = GRANT_INVALID_REF;
67084 +               add_id_to_freelist(np->rx_skbs, id);
67085 +
67086 +               if (0 == mfn) {
67087 +                       struct page *page = skb_shinfo(skb)->frags[0].page;
67088 +                       balloon_release_driver_page(page);
67089 +                       skb_shinfo(skb)->nr_frags = 0;
67090 +                       dev_kfree_skb(skb);
67091 +                       noxfer++;
67092 +                       continue;
67093 +               }
67094 +
67095 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
67096 +                       /* Remap the page. */
67097 +                       struct page *page = skb_shinfo(skb)->frags[0].page;
67098 +                       unsigned long pfn = page_to_pfn(page);
67099 +                       void *vaddr = page_address(page);
67100 +
67101 +                       MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
67102 +                                               pfn_pte_ma(mfn, PAGE_KERNEL),
67103 +                                               0);
67104 +                       mcl++;
67105 +                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
67106 +                               | MMU_MACHPHYS_UPDATE;
67107 +                       mmu->val = pfn;
67108 +                       mmu++;
67109 +
67110 +                       set_phys_to_machine(pfn, mfn);
67111 +               }
67112 +               __skb_queue_tail(&free_list, skb);
67113 +               xfer++;
67114 +       }
67115 +
67116 +       printk("%s: %d xfer, %d noxfer, %d unused\n",
67117 +              __FUNCTION__, xfer, noxfer, unused);
67118 +
67119 +       if (xfer) {
67120 +               /* Some pages are no longer absent... */
67121 +               balloon_update_driver_allowance(-xfer);
67122 +
67123 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
67124 +                       /* Do all the remapping work and M2P updates. */
67125 +                       mcl->op = __HYPERVISOR_mmu_update;
67126 +                       mcl->args[0] = (unsigned long)np->rx_mmu;
67127 +                       mcl->args[1] = mmu - np->rx_mmu;
67128 +                       mcl->args[2] = 0;
67129 +                       mcl->args[3] = DOMID_SELF;
67130 +                       mcl++;
67131 +                       HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
67132 +               }
67133 +       }
67134 +
67135 +       while ((skb = __skb_dequeue(&free_list)) != NULL)
67136 +               dev_kfree_skb(skb);
67137 +
67138 +       spin_unlock(&np->rx_lock);
67139 +}
67140 +
67141 +static int network_close(struct net_device *dev)
67142 +{
67143 +       struct netfront_info *np = netdev_priv(dev);
67144 +       netif_stop_queue(np->netdev);
67145 +       return 0;
67146 +}
67147 +
67148 +
67149 +static struct net_device_stats *network_get_stats(struct net_device *dev)
67150 +{
67151 +       struct netfront_info *np = netdev_priv(dev);
67152 +       return &np->stats;
67153 +}
67154 +
67155 +static int xennet_change_mtu(struct net_device *dev, int mtu)
67156 +{
67157 +       int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
67158 +
67159 +       if (mtu > max)
67160 +               return -EINVAL;
67161 +       dev->mtu = mtu;
67162 +       return 0;
67163 +}
67164 +
67165 +static int xennet_set_sg(struct net_device *dev, u32 data)
67166 +{
67167 +       if (data) {
67168 +               struct netfront_info *np = netdev_priv(dev);
67169 +               int val;
67170 +
67171 +               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
67172 +                                "%d", &val) < 0)
67173 +                       val = 0;
67174 +               if (!val)
67175 +                       return -ENOSYS;
67176 +       } else if (dev->mtu > ETH_DATA_LEN)
67177 +               dev->mtu = ETH_DATA_LEN;
67178 +
67179 +       return ethtool_op_set_sg(dev, data);
67180 +}
67181 +
67182 +static int xennet_set_tso(struct net_device *dev, u32 data)
67183 +{
67184 +#ifdef HAVE_TSO
67185 +       if (data) {
67186 +               struct netfront_info *np = netdev_priv(dev);
67187 +               int val;
67188 +
67189 +               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
67190 +                                "feature-gso-tcpv4", "%d", &val) < 0)
67191 +                       val = 0;
67192 +               if (!val)
67193 +                       return -ENOSYS;
67194 +       }
67195 +
67196 +       return ethtool_op_set_tso(dev, data);
67197 +#else
67198 +       return -ENOSYS;
67199 +#endif
67200 +}
67201 +
67202 +static void xennet_set_features(struct net_device *dev)
67203 +{
67204 +       dev_disable_gso_features(dev);
67205 +       xennet_set_sg(dev, 0);
67206 +
67207 +       /* We need checksum offload to enable scatter/gather and TSO. */
67208 +       if (!(dev->features & NETIF_F_IP_CSUM))
67209 +               return;
67210 +
67211 +       if (xennet_set_sg(dev, 1))
67212 +               return;
67213 +
67214 +       /* Before 2.6.9 TSO seems to be unreliable so do not enable it
67215 +        * on older kernels.
67216 +        */
67217 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
67218 +       xennet_set_tso(dev, 1);
67219 +#endif
67220 +
67221 +}
67222 +
67223 +static int network_connect(struct net_device *dev)
67224 +{
67225 +       struct netfront_info *np = netdev_priv(dev);
67226 +       int i, requeue_idx, err;
67227 +       struct sk_buff *skb;
67228 +       grant_ref_t ref;
67229 +       netif_rx_request_t *req;
67230 +       unsigned int feature_rx_copy, feature_rx_flip;
67231 +
67232 +       err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
67233 +                          "feature-rx-copy", "%u", &feature_rx_copy);
67234 +       if (err != 1)
67235 +               feature_rx_copy = 0;
67236 +       err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
67237 +                          "feature-rx-flip", "%u", &feature_rx_flip);
67238 +       if (err != 1)
67239 +               feature_rx_flip = 1;
67240 +
67241 +       /*
67242 +        * Copy packets on receive path if:
67243 +        *  (a) This was requested by user, and the backend supports it; or
67244 +        *  (b) Flipping was requested, but this is unsupported by the backend.
67245 +        */
67246 +       np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
67247 +                               (MODPARM_rx_flip && !feature_rx_flip));
67248 +
67249 +       err = talk_to_backend(np->xbdev, np);
67250 +       if (err)
67251 +               return err;
67252 +
67253 +       xennet_set_features(dev);
67254 +
67255 +       IPRINTK("device %s has %sing receive path.\n",
67256 +               dev->name, np->copying_receiver ? "copy" : "flipp");
67257 +
67258 +       spin_lock_irq(&np->tx_lock);
67259 +       spin_lock(&np->rx_lock);
67260 +
67261 +       /*
67262 +        * Recovery procedure:
67263 +        *  NB. Freelist index entries are always going to be less than
67264 +        *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
67265 +        *  greater than PAGE_OFFSET: we use this property to distinguish
67266 +        *  them.
67267 +        */
67268 +
67269 +       /* Step 1: Discard all pending TX packet fragments. */
67270 +       netif_release_tx_bufs(np);
67271 +
67272 +       /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
67273 +       for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
67274 +               if (!np->rx_skbs[i])
67275 +                       continue;
67276 +
67277 +               skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
67278 +               ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
67279 +               req = RING_GET_REQUEST(&np->rx, requeue_idx);
67280 +
67281 +               if (!np->copying_receiver) {
67282 +                       gnttab_grant_foreign_transfer_ref(
67283 +                               ref, np->xbdev->otherend_id,
67284 +                               page_to_pfn(skb_shinfo(skb)->frags->page));
67285 +               } else {
67286 +                       gnttab_grant_foreign_access_ref(
67287 +                               ref, np->xbdev->otherend_id,
67288 +                               pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
67289 +                                                      frags->page)),
67290 +                               0);
67291 +               }
67292 +               req->gref = ref;
67293 +               req->id   = requeue_idx;
67294 +
67295 +               requeue_idx++;
67296 +       }
67297 +
67298 +       np->rx.req_prod_pvt = requeue_idx;
67299 +
67300 +       /*
67301 +        * Step 3: All public and private state should now be sane.  Get
67302 +        * ready to start sending and receiving packets and give the driver
67303 +        * domain a kick because we've probably just requeued some
67304 +        * packets.
67305 +        */
67306 +       netif_carrier_on(dev);
67307 +       notify_remote_via_irq(np->irq);
67308 +       network_tx_buf_gc(dev);
67309 +       network_alloc_rx_buffers(dev);
67310 +
67311 +       spin_unlock(&np->rx_lock);
67312 +       spin_unlock_irq(&np->tx_lock);
67313 +
67314 +       return 0;
67315 +}
67316 +
67317 +static void netif_uninit(struct net_device *dev)
67318 +{
67319 +       struct netfront_info *np = netdev_priv(dev);
67320 +       netif_release_tx_bufs(np);
67321 +       netif_release_rx_bufs(np);
67322 +       gnttab_free_grant_references(np->gref_tx_head);
67323 +       gnttab_free_grant_references(np->gref_rx_head);
67324 +}
67325 +
67326 +static struct ethtool_ops network_ethtool_ops =
67327 +{
67328 +       .get_tx_csum = ethtool_op_get_tx_csum,
67329 +       .set_tx_csum = ethtool_op_set_tx_csum,
67330 +       .get_sg = ethtool_op_get_sg,
67331 +       .set_sg = xennet_set_sg,
67332 +       .get_tso = ethtool_op_get_tso,
67333 +       .set_tso = xennet_set_tso,
67334 +       .get_link = ethtool_op_get_link,
67335 +};
67336 +
67337 +#ifdef CONFIG_SYSFS
67338 +static ssize_t show_rxbuf_min(struct class_device *cd, char *buf)
67339 +{
67340 +       struct net_device *netdev = container_of(cd, struct net_device,
67341 +                                                class_dev);
67342 +       struct netfront_info *info = netdev_priv(netdev);
67343 +
67344 +       return sprintf(buf, "%u\n", info->rx_min_target);
67345 +}
67346 +
67347 +static ssize_t store_rxbuf_min(struct class_device *cd,
67348 +                              const char *buf, size_t len)
67349 +{
67350 +       struct net_device *netdev = container_of(cd, struct net_device,
67351 +                                                class_dev);
67352 +       struct netfront_info *np = netdev_priv(netdev);
67353 +       char *endp;
67354 +       unsigned long target;
67355 +
67356 +       if (!capable(CAP_NET_ADMIN))
67357 +               return -EPERM;
67358 +
67359 +       target = simple_strtoul(buf, &endp, 0);
67360 +       if (endp == buf)
67361 +               return -EBADMSG;
67362 +
67363 +       if (target < RX_MIN_TARGET)
67364 +               target = RX_MIN_TARGET;
67365 +       if (target > RX_MAX_TARGET)
67366 +               target = RX_MAX_TARGET;
67367 +
67368 +       spin_lock(&np->rx_lock);
67369 +       if (target > np->rx_max_target)
67370 +               np->rx_max_target = target;
67371 +       np->rx_min_target = target;
67372 +       if (target > np->rx_target)
67373 +               np->rx_target = target;
67374 +
67375 +       network_alloc_rx_buffers(netdev);
67376 +
67377 +       spin_unlock(&np->rx_lock);
67378 +       return len;
67379 +}
67380 +
67381 +static ssize_t show_rxbuf_max(struct class_device *cd, char *buf)
67382 +{
67383 +       struct net_device *netdev = container_of(cd, struct net_device,
67384 +                                                class_dev);
67385 +       struct netfront_info *info = netdev_priv(netdev);
67386 +
67387 +       return sprintf(buf, "%u\n", info->rx_max_target);
67388 +}
67389 +
67390 +static ssize_t store_rxbuf_max(struct class_device *cd,
67391 +                              const char *buf, size_t len)
67392 +{
67393 +       struct net_device *netdev = container_of(cd, struct net_device,
67394 +                                                class_dev);
67395 +       struct netfront_info *np = netdev_priv(netdev);
67396 +       char *endp;
67397 +       unsigned long target;
67398 +
67399 +       if (!capable(CAP_NET_ADMIN))
67400 +               return -EPERM;
67401 +
67402 +       target = simple_strtoul(buf, &endp, 0);
67403 +       if (endp == buf)
67404 +               return -EBADMSG;
67405 +
67406 +       if (target < RX_MIN_TARGET)
67407 +               target = RX_MIN_TARGET;
67408 +       if (target > RX_MAX_TARGET)
67409 +               target = RX_MAX_TARGET;
67410 +
67411 +       spin_lock(&np->rx_lock);
67412 +       if (target < np->rx_min_target)
67413 +               np->rx_min_target = target;
67414 +       np->rx_max_target = target;
67415 +       if (target < np->rx_target)
67416 +               np->rx_target = target;
67417 +
67418 +       network_alloc_rx_buffers(netdev);
67419 +
67420 +       spin_unlock(&np->rx_lock);
67421 +       return len;
67422 +}
67423 +
67424 +static ssize_t show_rxbuf_cur(struct class_device *cd, char *buf)
67425 +{
67426 +       struct net_device *netdev = container_of(cd, struct net_device,
67427 +                                                class_dev);
67428 +       struct netfront_info *info = netdev_priv(netdev);
67429 +
67430 +       return sprintf(buf, "%u\n", info->rx_target);
67431 +}
67432 +
67433 +static const struct class_device_attribute xennet_attrs[] = {
67434 +       __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
67435 +       __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
67436 +       __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
67437 +};
67438 +
67439 +static int xennet_sysfs_addif(struct net_device *netdev)
67440 +{
67441 +       int i;
67442 +       int error = 0;
67443 +
67444 +       for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
67445 +               error = class_device_create_file(&netdev->class_dev, 
67446 +                                                &xennet_attrs[i]);
67447 +               if (error)
67448 +                       goto fail;
67449 +       }
67450 +       return 0;
67451 +
67452 + fail:
67453 +       while (--i >= 0)
67454 +               class_device_remove_file(&netdev->class_dev,
67455 +                                        &xennet_attrs[i]);
67456 +       return error;
67457 +}
67458 +
67459 +static void xennet_sysfs_delif(struct net_device *netdev)
67460 +{
67461 +       int i;
67462 +
67463 +       for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
67464 +               class_device_remove_file(&netdev->class_dev,
67465 +                                        &xennet_attrs[i]);
67466 +       }
67467 +}
67468 +
67469 +#endif /* CONFIG_SYSFS */
67470 +
67471 +
67472 +/*
67473 + * Nothing to do here. Virtual interface is point-to-point and the
67474 + * physical interface is probably promiscuous anyway.
67475 + */
67476 +static void network_set_multicast_list(struct net_device *dev)
67477 +{
67478 +}
67479 +
67480 +static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
67481 +{
67482 +       int i, err = 0;
67483 +       struct net_device *netdev = NULL;
67484 +       struct netfront_info *np = NULL;
67485 +
67486 +       netdev = alloc_etherdev(sizeof(struct netfront_info));
67487 +       if (!netdev) {
67488 +               printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
67489 +                      __FUNCTION__);
67490 +               return ERR_PTR(-ENOMEM);
67491 +       }
67492 +
67493 +       np                   = netdev_priv(netdev);
67494 +       np->xbdev            = dev;
67495 +
67496 +       netif_carrier_off(netdev);
67497 +
67498 +       spin_lock_init(&np->tx_lock);
67499 +       spin_lock_init(&np->rx_lock);
67500 +
67501 +       skb_queue_head_init(&np->rx_batch);
67502 +       np->rx_target     = RX_DFL_MIN_TARGET;
67503 +       np->rx_min_target = RX_DFL_MIN_TARGET;
67504 +       np->rx_max_target = RX_MAX_TARGET;
67505 +
67506 +       init_timer(&np->rx_refill_timer);
67507 +       np->rx_refill_timer.data = (unsigned long)netdev;
67508 +       np->rx_refill_timer.function = rx_refill_timeout;
67509 +
67510 +       /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
67511 +       for (i = 0; i <= NET_TX_RING_SIZE; i++) {
67512 +               np->tx_skbs[i] = (void *)((unsigned long) i+1);
67513 +               np->grant_tx_ref[i] = GRANT_INVALID_REF;
67514 +       }
67515 +
67516 +       for (i = 0; i < NET_RX_RING_SIZE; i++) {
67517 +               np->rx_skbs[i] = NULL;
67518 +               np->grant_rx_ref[i] = GRANT_INVALID_REF;
67519 +       }
67520 +
67521 +       /* A grant for every tx ring slot */
67522 +       if (gnttab_alloc_grant_references(TX_MAX_TARGET,
67523 +                                         &np->gref_tx_head) < 0) {
67524 +               printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
67525 +               err = -ENOMEM;
67526 +               goto exit;
67527 +       }
67528 +       /* A grant for every rx ring slot */
67529 +       if (gnttab_alloc_grant_references(RX_MAX_TARGET,
67530 +                                         &np->gref_rx_head) < 0) {
67531 +               printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
67532 +               err = -ENOMEM;
67533 +               goto exit_free_tx;
67534 +       }
67535 +
67536 +       netdev->open            = network_open;
67537 +       netdev->hard_start_xmit = network_start_xmit;
67538 +       netdev->stop            = network_close;
67539 +       netdev->get_stats       = network_get_stats;
67540 +       netdev->poll            = netif_poll;
67541 +       netdev->set_multicast_list = network_set_multicast_list;
67542 +       netdev->uninit          = netif_uninit;
67543 +       netdev->change_mtu      = xennet_change_mtu;
67544 +       netdev->weight          = 64;
67545 +       netdev->features        = NETIF_F_IP_CSUM;
67546 +
67547 +       SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
67548 +       SET_MODULE_OWNER(netdev);
67549 +       SET_NETDEV_DEV(netdev, &dev->dev);
67550 +
67551 +       np->netdev = netdev;
67552 +       return netdev;
67553 +
67554 + exit_free_tx:
67555 +       gnttab_free_grant_references(np->gref_tx_head);
67556 + exit:
67557 +       free_netdev(netdev);
67558 +       return ERR_PTR(err);
67559 +}
67560 +
67561 +/*
67562 + * We use this notifier to send out a fake ARP reply to reset switches and
67563 + * router ARP caches when an IP interface is brought up on a VIF.
67564 + */
67565 +static int
67566 +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
67567 +{
67568 +       struct in_ifaddr  *ifa = (struct in_ifaddr *)ptr;
67569 +       struct net_device *dev = ifa->ifa_dev->dev;
67570 +
67571 +       /* UP event and is it one of our devices? */
67572 +       if (event == NETDEV_UP && dev->open == network_open)
67573 +               (void)send_fake_arp(dev);
67574 +
67575 +       return NOTIFY_DONE;
67576 +}
67577 +
67578 +
67579 +static void netif_disconnect_backend(struct netfront_info *info)
67580 +{
67581 +       /* Stop old i/f to prevent errors whilst we rebuild the state. */
67582 +       spin_lock_irq(&info->tx_lock);
67583 +       spin_lock(&info->rx_lock);
67584 +       netif_carrier_off(info->netdev);
67585 +       spin_unlock(&info->rx_lock);
67586 +       spin_unlock_irq(&info->tx_lock);
67587 +
67588 +       if (info->irq)
67589 +               unbind_from_irqhandler(info->irq, info->netdev);
67590 +       info->evtchn = info->irq = 0;
67591 +
67592 +       end_access(info->tx_ring_ref, info->tx.sring);
67593 +       end_access(info->rx_ring_ref, info->rx.sring);
67594 +       info->tx_ring_ref = GRANT_INVALID_REF;
67595 +       info->rx_ring_ref = GRANT_INVALID_REF;
67596 +       info->tx.sring = NULL;
67597 +       info->rx.sring = NULL;
67598 +}
67599 +
67600 +
67601 +static void end_access(int ref, void *page)
67602 +{
67603 +       if (ref != GRANT_INVALID_REF)
67604 +               gnttab_end_foreign_access(ref, 0, (unsigned long)page);
67605 +}
67606 +
67607 +
67608 +/* ** Driver registration ** */
67609 +
67610 +
67611 +static struct xenbus_device_id netfront_ids[] = {
67612 +       { "vif" },
67613 +       { "" }
67614 +};
67615 +
67616 +
67617 +static struct xenbus_driver netfront = {
67618 +       .name = "vif",
67619 +       .owner = THIS_MODULE,
67620 +       .ids = netfront_ids,
67621 +       .probe = netfront_probe,
67622 +       .remove = __devexit_p(netfront_remove),
67623 +       .resume = netfront_resume,
67624 +       .otherend_changed = backend_changed,
67625 +};
67626 +
67627 +
67628 +static struct notifier_block notifier_inetdev = {
67629 +       .notifier_call  = inetdev_notify,
67630 +       .next           = NULL,
67631 +       .priority       = 0
67632 +};
67633 +
67634 +static int __init netif_init(void)
67635 +{
67636 +       if (!is_running_on_xen())
67637 +               return -ENODEV;
67638 +
67639 +#ifdef CONFIG_XEN
67640 +       if (MODPARM_rx_flip && MODPARM_rx_copy) {
67641 +               WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
67642 +               return -EINVAL;
67643 +       }
67644 +
67645 +       if (!MODPARM_rx_flip && !MODPARM_rx_copy)
67646 +               MODPARM_rx_flip = 1; /* Default is to flip. */
67647 +#endif
67648 +
67649 +       if (is_initial_xendomain())
67650 +               return 0;
67651 +
67652 +       IPRINTK("Initialising virtual ethernet driver.\n");
67653 +
67654 +       (void)register_inetaddr_notifier(&notifier_inetdev);
67655 +
67656 +       return xenbus_register_frontend(&netfront);
67657 +}
67658 +module_init(netif_init);
67659 +
67660 +
67661 +static void __exit netif_exit(void)
67662 +{
67663 +       if (is_initial_xendomain())
67664 +               return;
67665 +
67666 +       unregister_inetaddr_notifier(&notifier_inetdev);
67667 +
67668 +       return xenbus_unregister_driver(&netfront);
67669 +}
67670 +module_exit(netif_exit);
67671 +
67672 +MODULE_LICENSE("Dual BSD/GPL");
67673 diff -ruNp linux-2.6.19/drivers/xen/pciback/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/pciback/Makefile
67674 --- linux-2.6.19/drivers/xen/pciback/Makefile   1970-01-01 00:00:00.000000000 +0000
67675 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/Makefile 2007-02-02 19:10:45.000000000 +0000
67676 @@ -0,0 +1,15 @@
67677 +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
67678 +
67679 +pciback-y := pci_stub.o pciback_ops.o xenbus.o
67680 +pciback-y += conf_space.o conf_space_header.o \
67681 +            conf_space_capability.o \
67682 +            conf_space_capability_vpd.o \
67683 +            conf_space_capability_pm.o \
67684 +             conf_space_quirks.o
67685 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
67686 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
67687 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
67688 +
67689 +ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
67690 +EXTRA_CFLAGS += -DDEBUG
67691 +endif
67692 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space.c
67693 --- linux-2.6.19/drivers/xen/pciback/conf_space.c       1970-01-01 00:00:00.000000000 +0000
67694 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space.c     2007-02-02 19:10:45.000000000 +0000
67695 @@ -0,0 +1,425 @@
67696 +/*
67697 + * PCI Backend - Functions for creating a virtual configuration space for
67698 + *               exported PCI Devices.
67699 + *               It's dangerous to allow PCI Driver Domains to change their
67700 + *               device's resources (memory, i/o ports, interrupts). We need to
67701 + *               restrict changes to certain PCI Configuration registers:
67702 + *               BARs, INTERRUPT_PIN, most registers in the header...
67703 + *
67704 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
67705 + */
67706 +
67707 +#include <linux/kernel.h>
67708 +#include <linux/pci.h>
67709 +#include "pciback.h"
67710 +#include "conf_space.h"
67711 +#include "conf_space_quirks.h"
67712 +
67713 +#define DEFINE_PCI_CONFIG(op,size,type)                        \
67714 +int pciback_##op##_config_##size                               \
67715 +(struct pci_dev *dev, int offset, type value, void *data)      \
67716 +{                                                              \
67717 +       return pci_##op##_config_##size (dev, offset, value);   \
67718 +}
67719 +
67720 +DEFINE_PCI_CONFIG(read, byte, u8 *)
67721 +DEFINE_PCI_CONFIG(read, word, u16 *)
67722 +DEFINE_PCI_CONFIG(read, dword, u32 *)
67723 +
67724 +DEFINE_PCI_CONFIG(write, byte, u8)
67725 +DEFINE_PCI_CONFIG(write, word, u16)
67726 +DEFINE_PCI_CONFIG(write, dword, u32)
67727 +
67728 +static int conf_space_read(struct pci_dev *dev,
67729 +                          struct config_field_entry *entry, int offset,
67730 +                          u32 * value)
67731 +{
67732 +       int ret = 0;
67733 +       struct config_field *field = entry->field;
67734 +
67735 +       *value = 0;
67736 +
67737 +       switch (field->size) {
67738 +       case 1:
67739 +               if (field->u.b.read)
67740 +                       ret = field->u.b.read(dev, offset, (u8 *) value,
67741 +                                             entry->data);
67742 +               break;
67743 +       case 2:
67744 +               if (field->u.w.read)
67745 +                       ret = field->u.w.read(dev, offset, (u16 *) value,
67746 +                                             entry->data);
67747 +               break;
67748 +       case 4:
67749 +               if (field->u.dw.read)
67750 +                       ret = field->u.dw.read(dev, offset, value, entry->data);
67751 +               break;
67752 +       }
67753 +       return ret;
67754 +}
67755 +
67756 +static int conf_space_write(struct pci_dev *dev,
67757 +                           struct config_field_entry *entry, int offset,
67758 +                           u32 value)
67759 +{
67760 +       int ret = 0;
67761 +       struct config_field *field = entry->field;
67762 +
67763 +       switch (field->size) {
67764 +       case 1:
67765 +               if (field->u.b.write)
67766 +                       ret = field->u.b.write(dev, offset, (u8) value,
67767 +                                              entry->data);
67768 +               break;
67769 +       case 2:
67770 +               if (field->u.w.write)
67771 +                       ret = field->u.w.write(dev, offset, (u16) value,
67772 +                                              entry->data);
67773 +               break;
67774 +       case 4:
67775 +               if (field->u.dw.write)
67776 +                       ret = field->u.dw.write(dev, offset, value,
67777 +                                               entry->data);
67778 +               break;
67779 +       }
67780 +       return ret;
67781 +}
67782 +
67783 +static inline u32 get_mask(int size)
67784 +{
67785 +       if (size == 1)
67786 +               return 0xff;
67787 +       else if (size == 2)
67788 +               return 0xffff;
67789 +       else
67790 +               return 0xffffffff;
67791 +}
67792 +
67793 +static inline int valid_request(int offset, int size)
67794 +{
67795 +       /* Validate request (no un-aligned requests) */
67796 +       if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
67797 +               return 1;
67798 +       return 0;
67799 +}
67800 +
67801 +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
67802 +                             int offset)
67803 +{
67804 +       if (offset >= 0) {
67805 +               new_val_mask <<= (offset * 8);
67806 +               new_val <<= (offset * 8);
67807 +       } else {
67808 +               new_val_mask >>= (offset * -8);
67809 +               new_val >>= (offset * -8);
67810 +       }
67811 +       val = (val & ~new_val_mask) | (new_val & new_val_mask);
67812 +
67813 +       return val;
67814 +}
67815 +
67816 +static int pcibios_err_to_errno(int err)
67817 +{
67818 +       switch (err) {
67819 +       case PCIBIOS_SUCCESSFUL:
67820 +               return XEN_PCI_ERR_success;
67821 +       case PCIBIOS_DEVICE_NOT_FOUND:
67822 +               return XEN_PCI_ERR_dev_not_found;
67823 +       case PCIBIOS_BAD_REGISTER_NUMBER:
67824 +               return XEN_PCI_ERR_invalid_offset;
67825 +       case PCIBIOS_FUNC_NOT_SUPPORTED:
67826 +               return XEN_PCI_ERR_not_implemented;
67827 +       case PCIBIOS_SET_FAILED:
67828 +               return XEN_PCI_ERR_access_denied;
67829 +       }
67830 +       return err;
67831 +}
67832 +
67833 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
67834 +                       u32 * ret_val)
67835 +{
67836 +       int err = 0;
67837 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
67838 +       struct config_field_entry *cfg_entry;
67839 +       struct config_field *field;
67840 +       int req_start, req_end, field_start, field_end;
67841 +       /* if read fails for any reason, return 0 (as if device didn't respond) */
67842 +       u32 value = 0, tmp_val;
67843 +
67844 +       if (unlikely(verbose_request))
67845 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
67846 +                      pci_name(dev), size, offset);
67847 +
67848 +       if (!valid_request(offset, size)) {
67849 +               err = XEN_PCI_ERR_invalid_offset;
67850 +               goto out;
67851 +       }
67852 +
67853 +       /* Get the real value first, then modify as appropriate */
67854 +       switch (size) {
67855 +       case 1:
67856 +               err = pci_read_config_byte(dev, offset, (u8 *) & value);
67857 +               break;
67858 +       case 2:
67859 +               err = pci_read_config_word(dev, offset, (u16 *) & value);
67860 +               break;
67861 +       case 4:
67862 +               err = pci_read_config_dword(dev, offset, &value);
67863 +               break;
67864 +       }
67865 +
67866 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
67867 +               field = cfg_entry->field;
67868 +
67869 +               req_start = offset;
67870 +               req_end = offset + size;
67871 +               field_start = OFFSET(cfg_entry);
67872 +               field_end = OFFSET(cfg_entry) + field->size;
67873 +
67874 +               if ((req_start >= field_start && req_start < field_end)
67875 +                   || (req_end > field_start && req_end <= field_end)) {
67876 +                       err = conf_space_read(dev, cfg_entry, field_start,
67877 +                                             &tmp_val);
67878 +                       if (err)
67879 +                               goto out;
67880 +
67881 +                       value = merge_value(value, tmp_val,
67882 +                                           get_mask(field->size),
67883 +                                           field_start - req_start);
67884 +               }
67885 +       }
67886 +
67887 +      out:
67888 +       if (unlikely(verbose_request))
67889 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
67890 +                      pci_name(dev), size, offset, value);
67891 +
67892 +       *ret_val = value;
67893 +       return pcibios_err_to_errno(err);
67894 +}
67895 +
67896 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
67897 +{
67898 +       int err = 0, handled = 0;
67899 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
67900 +       struct config_field_entry *cfg_entry;
67901 +       struct config_field *field;
67902 +       u32 tmp_val;
67903 +       int req_start, req_end, field_start, field_end;
67904 +
67905 +       if (unlikely(verbose_request))
67906 +               printk(KERN_DEBUG
67907 +                      "pciback: %s: write request %d bytes at 0x%x = %x\n",
67908 +                      pci_name(dev), size, offset, value);
67909 +
67910 +       if (!valid_request(offset, size))
67911 +               return XEN_PCI_ERR_invalid_offset;
67912 +
67913 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
67914 +               field = cfg_entry->field;
67915 +
67916 +               req_start = offset;
67917 +               req_end = offset + size;
67918 +               field_start = OFFSET(cfg_entry);
67919 +               field_end = OFFSET(cfg_entry) + field->size;
67920 +
67921 +               if ((req_start >= field_start && req_start < field_end)
67922 +                   || (req_end > field_start && req_end <= field_end)) {
67923 +                       tmp_val = 0;
67924 +
67925 +                       err = pciback_config_read(dev, field_start,
67926 +                                                 field->size, &tmp_val);
67927 +                       if (err)
67928 +                               break;
67929 +
67930 +                       tmp_val = merge_value(tmp_val, value, get_mask(size),
67931 +                                             req_start - field_start);
67932 +
67933 +                       err = conf_space_write(dev, cfg_entry, field_start,
67934 +                                              tmp_val);
67935 +
67936 +                       /* handled is set true here, but not every byte
67937 +                        * may have been written! Properly detecting if
67938 +                        * every byte is handled is unnecessary as the
67939 +                        * flag is used to detect devices that need
67940 +                        * special helpers to work correctly.
67941 +                        */
67942 +                       handled = 1;
67943 +               }
67944 +       }
67945 +
67946 +       if (!handled && !err) {
67947 +               /* By default, anything not specifically handled above is
67948 +                * read-only. The permissive flag changes this behavior so
67949 +                * that anything not specifically handled above is writable.
67950 +                * This means that some fields may still be read-only because
67951 +                * they have entries in the config_field list that intercept
67952 +                * the write and do nothing. */
67953 +               if (dev_data->permissive) {
67954 +                       switch (size) {
67955 +                       case 1:
67956 +                               err = pci_write_config_byte(dev, offset,
67957 +                                                           (u8) value);
67958 +                               break;
67959 +                       case 2:
67960 +                               err = pci_write_config_word(dev, offset,
67961 +                                                           (u16) value);
67962 +                               break;
67963 +                       case 4:
67964 +                               err = pci_write_config_dword(dev, offset,
67965 +                                                            (u32) value);
67966 +                               break;
67967 +                       }
67968 +               } else if (!dev_data->warned_on_write) {
67969 +                       dev_data->warned_on_write = 1;
67970 +                       dev_warn(&dev->dev, "Driver tried to write to a "
67971 +                                "read-only configuration space field at offset "
67972 +                                "0x%x, size %d. This may be harmless, but if "
67973 +                                "you have problems with your device:\n"
67974 +                                "1) see permissive attribute in sysfs\n"
67975 +                                "2) report problems to the xen-devel "
67976 +                                "mailing list along with details of your "
67977 +                                "device obtained from lspci.\n", offset, size);
67978 +               }
67979 +       }
67980 +
67981 +       return pcibios_err_to_errno(err);
67982 +}
67983 +
67984 +void pciback_config_free_dyn_fields(struct pci_dev *dev)
67985 +{
67986 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
67987 +       struct config_field_entry *cfg_entry, *t;
67988 +       struct config_field *field;
67989 +
67990 +       dev_dbg(&dev->dev,
67991 +               "freeing dynamically allocated virtual configuration space fields\n");
67992 +
67993 +       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
67994 +               field = cfg_entry->field;
67995 +
67996 +               if (field->clean) {
67997 +                       field->clean(field);
67998 +
67999 +                       if (cfg_entry->data)
68000 +                               kfree(cfg_entry->data);
68001 +
68002 +                       list_del(&cfg_entry->list);
68003 +                       kfree(cfg_entry);
68004 +               }
68005 +
68006 +       }
68007 +}
68008 +
68009 +void pciback_config_reset_dev(struct pci_dev *dev)
68010 +{
68011 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
68012 +       struct config_field_entry *cfg_entry;
68013 +       struct config_field *field;
68014 +
68015 +       dev_dbg(&dev->dev, "resetting virtual configuration space\n");
68016 +
68017 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
68018 +               field = cfg_entry->field;
68019 +
68020 +               if (field->reset)
68021 +                       field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
68022 +       }
68023 +}
68024 +
68025 +void pciback_config_free_dev(struct pci_dev *dev)
68026 +{
68027 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
68028 +       struct config_field_entry *cfg_entry, *t;
68029 +       struct config_field *field;
68030 +
68031 +       dev_dbg(&dev->dev, "freeing virtual configuration space fields\n");
68032 +
68033 +       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
68034 +               list_del(&cfg_entry->list);
68035 +
68036 +               field = cfg_entry->field;
68037 +
68038 +               if (field->release)
68039 +                       field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
68040 +
68041 +               kfree(cfg_entry);
68042 +       }
68043 +}
68044 +
68045 +int pciback_config_add_field_offset(struct pci_dev *dev,
68046 +                                   struct config_field *field,
68047 +                                   unsigned int offset)
68048 +{
68049 +       int err = 0;
68050 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
68051 +       struct config_field_entry *cfg_entry;
68052 +       void *tmp;
68053 +
68054 +       /* silently ignore duplicate fields */
68055 +       if (pciback_field_is_dup(dev, field->offset))
68056 +               goto out;
68057 +
68058 +       cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
68059 +       if (!cfg_entry) {
68060 +               err = -ENOMEM;
68061 +               goto out;
68062 +       }
68063 +
68064 +       cfg_entry->data = NULL;
68065 +       cfg_entry->field = field;
68066 +       cfg_entry->base_offset = offset;
68067 +
68068 +       if (field->init) {
68069 +               tmp = field->init(dev, OFFSET(cfg_entry));
68070 +
68071 +               if (IS_ERR(tmp)) {
68072 +                       err = PTR_ERR(tmp);
68073 +                       goto out;
68074 +               }
68075 +
68076 +               cfg_entry->data = tmp;
68077 +       }
68078 +
68079 +       dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
68080 +               OFFSET(cfg_entry));
68081 +       list_add_tail(&cfg_entry->list, &dev_data->config_fields);
68082 +
68083 +      out:
68084 +       if (err)
68085 +               kfree(cfg_entry);
68086 +
68087 +       return err;
68088 +}
68089 +
68090 +/* This sets up the device's virtual configuration space to keep track of 
68091 + * certain registers (like the base address registers (BARs)) so that we can
68092 + * keep the client from manipulating them directly.
68093 + */
68094 +int pciback_config_init_dev(struct pci_dev *dev)
68095 +{
68096 +       int err = 0;
68097 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
68098 +
68099 +       dev_dbg(&dev->dev, "initializing virtual configuration space\n");
68100 +
68101 +       INIT_LIST_HEAD(&dev_data->config_fields);
68102 +
68103 +       err = pciback_config_header_add_fields(dev);
68104 +       if (err)
68105 +               goto out;
68106 +
68107 +       err = pciback_config_capability_add_fields(dev);
68108 +       if (err)
68109 +               goto out;
68110 +
68111 +       err = pciback_config_quirks_init(dev);
68112 +
68113 +      out:
68114 +       return err;
68115 +}
68116 +
68117 +int pciback_config_init(void)
68118 +{
68119 +       return pciback_config_capability_init();
68120 +}
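[Editor's note] The read path above folds any intercepted field into the raw value via merge_value(). The following standalone sketch (not part of the patch) walks through that merge for a 2-byte virtual field overlaid on a 4-byte read; it assumes get_mask(), defined earlier in this file, simply yields a mask of `size` low-order 0xff bytes, as the local mask() helper does here.

	/* Illustrative sketch only -- mirrors merge_value() above. */
	#include <stdio.h>
	#include <stdint.h>

	static uint32_t mask(int size)
	{
		return size == 4 ? 0xffffffffu : ((1u << (size * 8)) - 1);
	}

	static uint32_t merge(uint32_t val, uint32_t new_val,
			      uint32_t new_val_mask, int offset)
	{
		if (offset >= 0) {
			new_val_mask <<= (offset * 8);
			new_val <<= (offset * 8);
		} else {
			new_val_mask >>= (offset * -8);
			new_val >>= (offset * -8);
		}
		return (val & ~new_val_mask) | (new_val & new_val_mask);
	}

	int main(void)
	{
		/* Guest reads 4 bytes at 0x40; a 2-byte virtual field lives at
		 * 0x42, so field_start - req_start = 2 and the virtual value
		 * lands in the upper half of the dword. */
		uint32_t raw = 0x11223344, virt = 0xabcd;
		printf("0x%08x\n", merge(raw, virt, mask(2), 2)); /* 0xabcd3344 */
		return 0;
	}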
68121 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space.h linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space.h
68122 --- linux-2.6.19/drivers/xen/pciback/conf_space.h       1970-01-01 00:00:00.000000000 +0000
68123 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space.h     2007-02-02 19:10:45.000000000 +0000
68124 @@ -0,0 +1,126 @@
68125 +/*
68126 + * PCI Backend - Common data structures for overriding the configuration space
68127 + *
68128 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68129 + */
68130 +
68131 +#ifndef __XEN_PCIBACK_CONF_SPACE_H__
68132 +#define __XEN_PCIBACK_CONF_SPACE_H__
68133 +
68134 +#include <linux/list.h>
68135 +#include <linux/err.h>
68136 +
68137 +/* conf_field_init can return an errno in a ptr with ERR_PTR() */
68138 +typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
68139 +typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
68140 +typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
68141 +
68142 +typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
68143 +                                void *data);
68144 +typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
68145 +                               void *data);
68146 +typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
68147 +                               void *data);
68148 +typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
68149 +                               void *data);
68150 +typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
68151 +                              void *data);
68152 +typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
68153 +                              void *data);
68154 +
68155 +/* These are the fields within the configuration space which we
68156 + * are interested in intercepting reads/writes to and changing their
68157 + * values.
68158 + */
68159 +struct config_field {
68160 +       unsigned int offset;
68161 +       unsigned int size;
68162 +       unsigned int mask;
68163 +       conf_field_init init;
68164 +       conf_field_reset reset;
68165 +       conf_field_free release;
68166 +       void (*clean) (struct config_field * field);
68167 +       union {
68168 +               struct {
68169 +                       conf_dword_write write;
68170 +                       conf_dword_read read;
68171 +               } dw;
68172 +               struct {
68173 +                       conf_word_write write;
68174 +                       conf_word_read read;
68175 +               } w;
68176 +               struct {
68177 +                       conf_byte_write write;
68178 +                       conf_byte_read read;
68179 +               } b;
68180 +       } u;
68181 +       struct list_head list;
68182 +};
68183 +
68184 +struct config_field_entry {
68185 +       struct list_head list;
68186 +       struct config_field *field;
68187 +       unsigned int base_offset;
68188 +       void *data;
68189 +};
68190 +
68191 +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
68192 +
68193 +/* Add fields to a device - the add_fields helpers expect a pointer to
68194 + * the first entry in an array (the end of which is marked by size==0)
68195 + */
68196 +int pciback_config_add_field_offset(struct pci_dev *dev,
68197 +                                   struct config_field *field,
68198 +                                   unsigned int offset);
68199 +
68200 +static inline int pciback_config_add_field(struct pci_dev *dev,
68201 +                                          struct config_field *field)
68202 +{
68203 +       return pciback_config_add_field_offset(dev, field, 0);
68204 +}
68205 +
68206 +static inline int pciback_config_add_fields(struct pci_dev *dev,
68207 +                                           struct config_field *field)
68208 +{
68209 +       int i, err = 0;
68210 +       for (i = 0; field[i].size != 0; i++) {
68211 +               err = pciback_config_add_field(dev, &field[i]);
68212 +               if (err)
68213 +                       break;
68214 +       }
68215 +       return err;
68216 +}
68217 +
68218 +static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
68219 +                                                  struct config_field *field,
68220 +                                                  unsigned int offset)
68221 +{
68222 +       int i, err = 0;
68223 +       for (i = 0; field[i].size != 0; i++) {
68224 +               err = pciback_config_add_field_offset(dev, &field[i], offset);
68225 +               if (err)
68226 +                       break;
68227 +       }
68228 +       return err;
68229 +}
68230 +
68231 +/* Read/Write the real configuration space */
68232 +int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
68233 +                            void *data);
68234 +int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
68235 +                            void *data);
68236 +int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
68237 +                             void *data);
68238 +int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
68239 +                             void *data);
68240 +int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
68241 +                             void *data);
68242 +int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
68243 +                              void *data);
68244 +
68245 +int pciback_config_capability_init(void);
68246 +
68247 +int pciback_config_header_add_fields(struct pci_dev *dev);
68248 +int pciback_config_capability_add_fields(struct pci_dev *dev);
68249 +
68250 +#endif                         /* __XEN_PCIBACK_CONF_SPACE_H__ */
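[Editor's note] As the header states, field arrays are terminated by an entry with size == 0. A minimal sketch of how a caller could describe and attach a single read-only byte field using only the declarations above; the offset 0x40 and the name my_fields are invented for illustration.

	/* Hypothetical example of the intended use of this header. */
	static struct config_field my_fields[] = {
		{
		 .offset    = 0x40,                      /* arbitrary example offset */
		 .size      = 1,
		 .u.b.read  = pciback_read_config_byte,  /* pass reads through */
		 .u.b.write = NULL,                      /* writes dropped: read-only */
		},
		{
		 .size = 0,                              /* terminator */
		},
	};

	/* ...then, from a device init path:
	 *      err = pciback_config_add_fields(dev, my_fields);
	 */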
68251 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_capability.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability.c
68252 --- linux-2.6.19/drivers/xen/pciback/conf_space_capability.c    1970-01-01 00:00:00.000000000 +0000
68253 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability.c  2007-02-02 19:10:45.000000000 +0000
68254 @@ -0,0 +1,71 @@
68255 +/*
68256 + * PCI Backend - Handles the virtual fields found on the capability lists
68257 + *               in the configuration space.
68258 + *
68259 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68260 + */
68261 +
68262 +#include <linux/kernel.h>
68263 +#include <linux/pci.h>
68264 +#include "pciback.h"
68265 +#include "conf_space.h"
68266 +#include "conf_space_capability.h"
68267 +
68268 +static LIST_HEAD(capabilities);
68269 +
68270 +static struct config_field caplist_header[] = {
68271 +       {
68272 +        .offset    = PCI_CAP_LIST_ID,
68273 +        .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
68274 +        .u.w.read  = pciback_read_config_word,
68275 +        .u.w.write = NULL,
68276 +       },
68277 +       {
68278 +        .size = 0,
68279 +       },
68280 +};
68281 +
68282 +static inline void register_capability(struct pciback_config_capability *cap)
68283 +{
68284 +       list_add_tail(&cap->cap_list, &capabilities);
68285 +}
68286 +
68287 +int pciback_config_capability_add_fields(struct pci_dev *dev)
68288 +{
68289 +       int err = 0;
68290 +       struct pciback_config_capability *cap;
68291 +       int cap_offset;
68292 +
68293 +       list_for_each_entry(cap, &capabilities, cap_list) {
68294 +               cap_offset = pci_find_capability(dev, cap->capability);
68295 +               if (cap_offset) {
68296 +                       dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
68297 +                               cap->capability, cap_offset);
68298 +
68299 +                       err = pciback_config_add_fields_offset(dev,
68300 +                                                              caplist_header,
68301 +                                                              cap_offset);
68302 +                       if (err)
68303 +                               goto out;
68304 +                       err = pciback_config_add_fields_offset(dev,
68305 +                                                              cap->fields,
68306 +                                                              cap_offset);
68307 +                       if (err)
68308 +                               goto out;
68309 +               }
68310 +       }
68311 +
68312 +      out:
68313 +       return err;
68314 +}
68315 +
68316 +extern struct pciback_config_capability pciback_config_capability_vpd;
68317 +extern struct pciback_config_capability pciback_config_capability_pm;
68318 +
68319 +int pciback_config_capability_init(void)
68320 +{
68321 +       register_capability(&pciback_config_capability_vpd);
68322 +       register_capability(&pciback_config_capability_pm);
68323 +
68324 +       return 0;
68325 +}
68326 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_capability.h linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability.h
68327 --- linux-2.6.19/drivers/xen/pciback/conf_space_capability.h    1970-01-01 00:00:00.000000000 +0000
68328 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability.h  2007-02-02 19:10:45.000000000 +0000
68329 @@ -0,0 +1,23 @@
68330 +/*
68331 + * PCI Backend - Data structures for special overlays for structures on
68332 + *               the capability list.
68333 + *
68334 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68335 + */
68336 +
68337 +#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
68338 +#define __PCIBACK_CONFIG_CAPABILITY_H__
68339 +
68340 +#include <linux/pci.h>
68341 +#include <linux/list.h>
68342 +
68343 +struct pciback_config_capability {
68344 +       struct list_head cap_list;
68345 +
68346 +       int capability;
68347 +
68348 +       /* If the device has the capability found above, add these fields */
68349 +       struct config_field *fields;
68350 +};
68351 +
68352 +#endif
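[Editor's note] Putting the two files together: a capability overlay is a struct pciback_config_capability naming a capability ID plus a size==0-terminated field array, and registering it from pciback_config_capability_init() makes pciback_config_capability_add_fields() attach the fields to any device advertising that capability. A hypothetical sketch; the field layout is invented and PCI_CAP_ID_MSI is only a stand-in ID.

	/* Hypothetical overlay, mirroring the VPD and PM overlays in this patch. */
	static struct config_field caplist_example[] = {
		{
		 .offset   = 2,          /* relative to the capability base */
		 .size     = 2,
		 .u.w.read = pciback_read_config_word,
		},
		{
		 .size = 0,
		},
	};

	struct pciback_config_capability pciback_config_capability_example = {
		.capability = PCI_CAP_ID_MSI,   /* stand-in; a real overlay names its own ID */
		.fields     = caplist_example,
	};

	/* register_capability(&pciback_config_capability_example) would then be
	 * called from pciback_config_capability_init(), as for vpd and pm. */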
68353 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_capability_pm.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability_pm.c
68354 --- linux-2.6.19/drivers/xen/pciback/conf_space_capability_pm.c 1970-01-01 00:00:00.000000000 +0000
68355 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability_pm.c       2007-02-02 19:10:45.000000000 +0000
68356 @@ -0,0 +1,113 @@
68357 +/*
68358 + * PCI Backend - Configuration space overlay for power management
68359 + *
68360 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68361 + */
68362 +
68363 +#include <linux/pci.h>
68364 +#include "conf_space.h"
68365 +#include "conf_space_capability.h"
68366 +
68367 +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
68368 +                       void *data)
68369 +{
68370 +       int err;
68371 +       u16 real_value;
68372 +
68373 +       err = pci_read_config_word(dev, offset, &real_value);
68374 +       if (err)
68375 +               goto out;
68376 +
68377 +       *value = real_value & ~PCI_PM_CAP_PME_MASK;
68378 +
68379 +      out:
68380 +       return err;
68381 +}
68382 +
68383 +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
68384 + * Can't allow driver domain to enable PMEs - they're shared */
68385 +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
68386 +
68387 +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
68388 +                        void *data)
68389 +{
68390 +       int err;
68391 +       u16 cur_value;
68392 +       pci_power_t new_state;
68393 +
68394 +       /* Handle setting power state separately */
68395 +       new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
68396 +
68397 +       err = pci_read_config_word(dev, offset, &cur_value);
68398 +       if (err)
68399 +               goto out;
68400 +
68401 +       new_value &= PM_OK_BITS;
68402 +       if ((cur_value & PM_OK_BITS) != new_value) {
68403 +               new_value = (cur_value & ~PM_OK_BITS) | new_value;
68404 +               err = pci_write_config_word(dev, offset, new_value);
68405 +               if (err)
68406 +                       goto out;
68407 +       }
68408 +
68409 +       /* Let pci core handle the power management change */
68410 +       dev_dbg(&dev->dev, "set power state to %x\n", new_state);
68411 +       err = pci_set_power_state(dev, new_state);
68412 +       if (err)
68413 +               err = PCIBIOS_SET_FAILED;
68414 +
68415 +      out:
68416 +       return err;
68417 +}
68418 +
68419 +/* Ensure PMEs are disabled */
68420 +static void *pm_ctrl_init(struct pci_dev *dev, int offset)
68421 +{
68422 +       int err;
68423 +       u16 value;
68424 +
68425 +       err = pci_read_config_word(dev, offset, &value);
68426 +       if (err)
68427 +               goto out;
68428 +
68429 +       if (value & PCI_PM_CTRL_PME_ENABLE) {
68430 +               value &= ~PCI_PM_CTRL_PME_ENABLE;
68431 +               err = pci_write_config_word(dev, offset, value);
68432 +       }
68433 +
68434 +      out:
68435 +       return ERR_PTR(err);
68436 +}
68437 +
68438 +static struct config_field caplist_pm[] = {
68439 +       {
68440 +               .offset     = PCI_PM_PMC,
68441 +               .size       = 2,
68442 +               .u.w.read   = pm_caps_read,
68443 +       },
68444 +       {
68445 +               .offset     = PCI_PM_CTRL,
68446 +               .size       = 2,
68447 +               .init       = pm_ctrl_init,
68448 +               .u.w.read   = pciback_read_config_word,
68449 +               .u.w.write  = pm_ctrl_write,
68450 +       },
68451 +       {
68452 +               .offset     = PCI_PM_PPB_EXTENSIONS,
68453 +               .size       = 1,
68454 +               .u.b.read   = pciback_read_config_byte,
68455 +       },
68456 +       {
68457 +               .offset     = PCI_PM_DATA_REGISTER,
68458 +               .size       = 1,
68459 +               .u.b.read   = pciback_read_config_byte,
68460 +       },
68461 +       {
68462 +               .size = 0,
68463 +       },
68464 +};
68465 +
68466 +struct pciback_config_capability pciback_config_capability_pm = {
68467 +       .capability = PCI_CAP_ID_PM,
68468 +       .fields = caplist_pm,
68469 +};
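[Editor's note] pm_ctrl_write() above lets the driver domain change only the PM_OK_BITS portion of PMCSR and hands the power-state change to pci_set_power_state(); every other bit keeps whatever is already in hardware. A worked example of that masking, with register contents invented for illustration (0x8000 and 0x1e00 are the standard PCI_PM_CTRL_PME_STATUS and PCI_PM_CTRL_DATA_SEL_MASK values).

	/* Sketch of the PM_OK_BITS filter, user-space illustration. */
	#include <stdio.h>
	#include <stdint.h>

	#define PME_STATUS    0x8000   /* PCI_PM_CTRL_PME_STATUS */
	#define DATA_SEL_MASK 0x1e00   /* PCI_PM_CTRL_DATA_SEL_MASK */
	#define OK_BITS       (PME_STATUS | DATA_SEL_MASK)

	int main(void)
	{
		uint16_t cur = 0x0100;   /* what hardware currently holds (made up) */
		uint16_t req = 0x8103;   /* guest asks for PME_STATUS + D3hot + extras */
		uint16_t allowed = req & OK_BITS;          /* only 0x8000 survives */
		uint16_t wr = (cur & ~OK_BITS) | allowed;  /* 0x8100 written back */
		printf("write 0x%04x (power state itself handled separately)\n", wr);
		return 0;
	}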
68470 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_capability_vpd.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability_vpd.c
68471 --- linux-2.6.19/drivers/xen/pciback/conf_space_capability_vpd.c        1970-01-01 00:00:00.000000000 +0000
68472 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_capability_vpd.c      2007-02-02 19:10:45.000000000 +0000
68473 @@ -0,0 +1,42 @@
68474 +/*
68475 + * PCI Backend - Configuration space overlay for Vital Product Data
68476 + *
68477 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68478 + */
68479 +
68480 +#include <linux/pci.h>
68481 +#include "conf_space.h"
68482 +#include "conf_space_capability.h"
68483 +
68484 +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
68485 +                            void *data)
68486 +{
68487 +       /* Disallow writes to the vital product data */
68488 +       if (value & PCI_VPD_ADDR_F)
68489 +               return PCIBIOS_SET_FAILED;
68490 +       else
68491 +               return pci_write_config_word(dev, offset, value);
68492 +}
68493 +
68494 +static struct config_field caplist_vpd[] = {
68495 +       {
68496 +        .offset    = PCI_VPD_ADDR,
68497 +        .size      = 2,
68498 +        .u.w.read  = pciback_read_config_word,
68499 +        .u.w.write = vpd_address_write,
68500 +        },
68501 +       {
68502 +        .offset     = PCI_VPD_DATA,
68503 +        .size       = 4,
68504 +        .u.dw.read  = pciback_read_config_dword,
68505 +        .u.dw.write = NULL,
68506 +        },
68507 +       {
68508 +        .size = 0,
68509 +        },
68510 +};
68511 +
68512 +struct pciback_config_capability pciback_config_capability_vpd = {
68513 +       .capability = PCI_CAP_ID_VPD,
68514 +       .fields = caplist_vpd,
68515 +};
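[Editor's note] vpd_address_write() above rejects any address write with PCI_VPD_ADDR_F set, since on a write cycle that flag asks the device to copy PCI_VPD_DATA into VPD storage; with it cleared only read cycles remain possible. A short illustration of the check, with the address value invented (0x8000 is the standard PCI_VPD_ADDR_F bit).

	/* Sketch of the VPD write filter, user-space illustration. */
	#include <stdio.h>
	#include <stdint.h>

	#define VPD_ADDR_F 0x8000   /* PCI_VPD_ADDR_F: 1 = start a VPD write cycle */

	int main(void)
	{
		uint16_t addr = 0x8010;   /* hypothetical write-cycle request at offset 0x10 */
		if (addr & VPD_ADDR_F)
			printf("rejected (PCIBIOS_SET_FAILED): would modify VPD\n");
		else
			printf("forwarded to pci_write_config_word()\n");
		return 0;
	}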
68516 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_header.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_header.c
68517 --- linux-2.6.19/drivers/xen/pciback/conf_space_header.c        1970-01-01 00:00:00.000000000 +0000
68518 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_header.c      2007-02-02 19:10:45.000000000 +0000
68519 @@ -0,0 +1,299 @@
68520 +/*
68521 + * PCI Backend - Handles the virtual fields in the configuration space headers.
68522 + *
68523 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68524 + */
68525 +
68526 +#include <linux/kernel.h>
68527 +#include <linux/pci.h>
68528 +#include "pciback.h"
68529 +#include "conf_space.h"
68530 +
68531 +struct pci_bar_info {
68532 +       u32 val;
68533 +       u32 len_val;
68534 +       int which;
68535 +};
68536 +
68537 +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
68538 +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
68539 +
68540 +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
68541 +{
68542 +       if (!dev->is_enabled && is_enable_cmd(value)) {
68543 +               if (unlikely(verbose_request))
68544 +                       printk(KERN_DEBUG "pciback: %s: enable\n",
68545 +                              pci_name(dev));
68546 +               pci_enable_device(dev);
68547 +       } else if (dev->is_enabled && !is_enable_cmd(value)) {
68548 +               if (unlikely(verbose_request))
68549 +                       printk(KERN_DEBUG "pciback: %s: disable\n",
68550 +                              pci_name(dev));
68551 +               pci_disable_device(dev);
68552 +       }
68553 +
68554 +       if (!dev->is_busmaster && is_master_cmd(value)) {
68555 +               if (unlikely(verbose_request))
68556 +                       printk(KERN_DEBUG "pciback: %s: set bus master\n",
68557 +                              pci_name(dev));
68558 +               pci_set_master(dev);
68559 +       }
68560 +
68561 +       if (value & PCI_COMMAND_INVALIDATE) {
68562 +               if (unlikely(verbose_request))
68563 +                       printk(KERN_DEBUG
68564 +                              "pciback: %s: enable memory-write-invalidate\n",
68565 +                              pci_name(dev));
68566 +               pci_set_mwi(dev);
68567 +       }
68568 +
68569 +       return pci_write_config_word(dev, offset, value);
68570 +}
68571 +
68572 +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
68573 +{
68574 +       struct pci_bar_info *bar = data;
68575 +
68576 +       if (unlikely(!bar)) {
68577 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
68578 +                      pci_name(dev));
68579 +               return XEN_PCI_ERR_op_failed;
68580 +       }
68581 +
68582 +       /* A write to obtain the length must happen as a 32-bit write.
68583 +        * This does not (yet) support writing individual bytes
68584 +        */
68585 +       if (value == ~PCI_ROM_ADDRESS_ENABLE)
68586 +               bar->which = 1;
68587 +       else
68588 +               bar->which = 0;
68589 +
68590 +       /* Do we need to support enabling/disabling the rom address here? */
68591 +
68592 +       return 0;
68593 +}
68594 +
68595 +/* For the BARs, only allow writes which write ~0 or
68596 + * the correct resource information
68597 + * (Needed for when the driver probes the resource usage)
68598 + */
68599 +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
68600 +{
68601 +       struct pci_bar_info *bar = data;
68602 +
68603 +       if (unlikely(!bar)) {
68604 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
68605 +                      pci_name(dev));
68606 +               return XEN_PCI_ERR_op_failed;
68607 +       }
68608 +
68609 +       /* A write to obtain the length must happen as a 32-bit write.
68610 +        * This does not (yet) support writing individual bytes
68611 +        */
68612 +       if (value == ~0)
68613 +               bar->which = 1;
68614 +       else
68615 +               bar->which = 0;
68616 +
68617 +       return 0;
68618 +}
68619 +
68620 +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
68621 +{
68622 +       struct pci_bar_info *bar = data;
68623 +
68624 +       if (unlikely(!bar)) {
68625 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
68626 +                      pci_name(dev));
68627 +               return XEN_PCI_ERR_op_failed;
68628 +       }
68629 +
68630 +       *value = bar->which ? bar->len_val : bar->val;
68631 +
68632 +       return 0;
68633 +}
68634 +
68635 +static inline void read_dev_bar(struct pci_dev *dev,
68636 +                               struct pci_bar_info *bar_info, int offset,
68637 +                               u32 len_mask)
68638 +{
68639 +       pci_read_config_dword(dev, offset, &bar_info->val);
68640 +       pci_write_config_dword(dev, offset, len_mask);
68641 +       pci_read_config_dword(dev, offset, &bar_info->len_val);
68642 +       pci_write_config_dword(dev, offset, bar_info->val);
68643 +}
68644 +
68645 +static void *bar_init(struct pci_dev *dev, int offset)
68646 +{
68647 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
68648 +
68649 +       if (!bar)
68650 +               return ERR_PTR(-ENOMEM);
68651 +
68652 +       read_dev_bar(dev, bar, offset, ~0);
68653 +       bar->which = 0;
68654 +
68655 +       return bar;
68656 +}
68657 +
68658 +static void *rom_init(struct pci_dev *dev, int offset)
68659 +{
68660 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
68661 +
68662 +       if (!bar)
68663 +               return ERR_PTR(-ENOMEM);
68664 +
68665 +       read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
68666 +       bar->which = 0;
68667 +
68668 +       return bar;
68669 +}
68670 +
68671 +static void bar_reset(struct pci_dev *dev, int offset, void *data)
68672 +{
68673 +       struct pci_bar_info *bar = data;
68674 +
68675 +       bar->which = 0;
68676 +}
68677 +
68678 +static void bar_release(struct pci_dev *dev, int offset, void *data)
68679 +{
68680 +       kfree(data);
68681 +}
68682 +
68683 +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
68684 +                         void *data)
68685 +{
68686 +       *value = (u8) dev->irq;
68687 +
68688 +       return 0;
68689 +}
68690 +
68691 +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
68692 +{
68693 +       u8 cur_value;
68694 +       int err;
68695 +
68696 +       err = pci_read_config_byte(dev, offset, &cur_value);
68697 +       if (err)
68698 +               goto out;
68699 +
68700 +       if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
68701 +           || value == PCI_BIST_START)
68702 +               err = pci_write_config_byte(dev, offset, value);
68703 +
68704 +      out:
68705 +       return err;
68706 +}
68707 +
68708 +static struct config_field header_common[] = {
68709 +       {
68710 +        .offset    = PCI_COMMAND,
68711 +        .size      = 2,
68712 +        .u.w.read  = pciback_read_config_word,
68713 +        .u.w.write = command_write,
68714 +       },
68715 +       {
68716 +        .offset    = PCI_INTERRUPT_LINE,
68717 +        .size      = 1,
68718 +        .u.b.read  = interrupt_read,
68719 +       },
68720 +       {
68721 +        .offset    = PCI_INTERRUPT_PIN,
68722 +        .size      = 1,
68723 +        .u.b.read  = pciback_read_config_byte,
68724 +       },
68725 +       {
68726 +        /* Any side effects of letting driver domain control cache line? */
68727 +        .offset    = PCI_CACHE_LINE_SIZE,
68728 +        .size      = 1,
68729 +        .u.b.read  = pciback_read_config_byte,
68730 +        .u.b.write = pciback_write_config_byte,
68731 +       },
68732 +       {
68733 +        .offset    = PCI_LATENCY_TIMER,
68734 +        .size      = 1,
68735 +        .u.b.read  = pciback_read_config_byte,
68736 +       },
68737 +       {
68738 +        .offset    = PCI_BIST,
68739 +        .size      = 1,
68740 +        .u.b.read  = pciback_read_config_byte,
68741 +        .u.b.write = bist_write,
68742 +       },
68743 +       {
68744 +        .size = 0,
68745 +       },
68746 +};
68747 +
68748 +#define CFG_FIELD_BAR(reg_offset)                      \
68749 +       {                                               \
68750 +        .offset     = reg_offset,                      \
68751 +        .size       = 4,                               \
68752 +        .init       = bar_init,                        \
68753 +        .reset      = bar_reset,                       \
68754 +        .release    = bar_release,                     \
68755 +        .u.dw.read  = bar_read,                        \
68756 +        .u.dw.write = bar_write,                       \
68757 +        }
68758 +
68759 +#define CFG_FIELD_ROM(reg_offset)                      \
68760 +       {                                               \
68761 +        .offset     = reg_offset,                      \
68762 +        .size       = 4,                               \
68763 +        .init       = rom_init,                        \
68764 +        .reset      = bar_reset,                       \
68765 +        .release    = bar_release,                     \
68766 +        .u.dw.read  = bar_read,                        \
68767 +        .u.dw.write = rom_write,                       \
68768 +        }
68769 +
68770 +static struct config_field header_0[] = {
68771 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
68772 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
68773 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
68774 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
68775 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
68776 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
68777 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS),
68778 +       {
68779 +        .size = 0,
68780 +       },
68781 +};
68782 +
68783 +static struct config_field header_1[] = {
68784 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
68785 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
68786 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
68787 +       {
68788 +        .size = 0,
68789 +       },
68790 +};
68791 +
68792 +int pciback_config_header_add_fields(struct pci_dev *dev)
68793 +{
68794 +       int err;
68795 +
68796 +       err = pciback_config_add_fields(dev, header_common);
68797 +       if (err)
68798 +               goto out;
68799 +
68800 +       switch (dev->hdr_type) {
68801 +       case PCI_HEADER_TYPE_NORMAL:
68802 +               err = pciback_config_add_fields(dev, header_0);
68803 +               break;
68804 +
68805 +       case PCI_HEADER_TYPE_BRIDGE:
68806 +               err = pciback_config_add_fields(dev, header_1);
68807 +               break;
68808 +
68809 +       default:
68810 +               err = -EINVAL;
68811 +               printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
68812 +                      pci_name(dev), dev->hdr_type);
68813 +               break;
68814 +       }
68815 +
68816 +      out:
68817 +       return err;
68818 +}
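[Editor's note] The BAR overlay above emulates the standard sizing handshake: read_dev_bar() latches both the programmed base (val) and the all-ones response (len_val) at init time, and a frontend write of ~0 flips bar->which so the next read returns len_val instead of val. From len_val the frontend recovers the region size in the usual way; a sketch with an invented len_val (the mask matches PCI_BASE_ADDRESS_MEM_MASK for a memory BAR).

	/* Sketch of BAR size recovery from the latched len_val. */
	#include <stdio.h>
	#include <stdint.h>

	#define MEM_MASK (~0x0fu)   /* PCI_BASE_ADDRESS_MEM_MASK */

	int main(void)
	{
		uint32_t len_val = 0xfffe0000;              /* device's response to ~0 */
		uint32_t size = ~(len_val & MEM_MASK) + 1;  /* 0x20000 = 128 KiB */
		printf("BAR size: 0x%x bytes\n", size);
		return 0;
	}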
68819 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_quirks.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_quirks.c
68820 --- linux-2.6.19/drivers/xen/pciback/conf_space_quirks.c        1970-01-01 00:00:00.000000000 +0000
68821 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_quirks.c      2007-02-02 19:10:45.000000000 +0000
68822 @@ -0,0 +1,128 @@
68823 +/*
68824 + * PCI Backend - Handle special overlays for broken devices.
68825 + *
68826 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68827 + * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
68828 + */
68829 +
68830 +#include <linux/kernel.h>
68831 +#include <linux/pci.h>
68832 +#include "pciback.h"
68833 +#include "conf_space.h"
68834 +#include "conf_space_quirks.h"
68835 +
68836 +LIST_HEAD(pciback_quirks);
68837 +
68838 +struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
68839 +{
68840 +       struct pciback_config_quirk *tmp_quirk;
68841 +
68842 +       list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
68843 +           if (pci_match_id(&tmp_quirk->devid, dev))
68844 +               goto out;
68845 +       tmp_quirk = NULL;
68846 +       printk(KERN_DEBUG
68847 +              "quirk didn't match any device pciback knows about\n");
68848 +      out:
68849 +       return tmp_quirk;
68850 +}
68851 +
68852 +static inline void register_quirk(struct pciback_config_quirk *quirk)
68853 +{
68854 +       list_add_tail(&quirk->quirks_list, &pciback_quirks);
68855 +}
68856 +
68857 +int pciback_field_is_dup(struct pci_dev *dev, int reg)
68858 +{
68859 +       int ret = 0;
68860 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
68861 +       struct config_field *field;
68862 +       struct config_field_entry *cfg_entry;
68863 +
68864 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
68865 +               field = cfg_entry->field;
68866 +               if (field->offset == reg) {
68867 +                       ret = 1;
68868 +                       break;
68869 +               }
68870 +       }
68871 +       return ret;
68872 +}
68873 +
68874 +int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
68875 +                                   *field)
68876 +{
68877 +       int err = 0;
68878 +
68879 +       switch (field->size) {
68880 +       case 1:
68881 +               field->u.b.read = pciback_read_config_byte;
68882 +               field->u.b.write = pciback_write_config_byte;
68883 +               break;
68884 +       case 2:
68885 +               field->u.w.read = pciback_read_config_word;
68886 +               field->u.w.write = pciback_write_config_word;
68887 +               break;
68888 +       case 4:
68889 +               field->u.dw.read = pciback_read_config_dword;
68890 +               field->u.dw.write = pciback_write_config_dword;
68891 +               break;
68892 +       default:
68893 +               err = -EINVAL;
68894 +               goto out;
68895 +       }
68896 +
68897 +       pciback_config_add_field(dev, field);
68898 +
68899 +      out:
68900 +       return err;
68901 +}
68902 +
68903 +int pciback_config_quirks_init(struct pci_dev *dev)
68904 +{
68905 +       struct pciback_config_quirk *quirk;
68906 +       int ret = 0;
68907 +
68908 +       quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
68909 +       if (!quirk) {
68910 +               ret = -ENOMEM;
68911 +               goto out;
68912 +       }
68913 +
68914 +       quirk->devid.vendor = dev->vendor;
68915 +       quirk->devid.device = dev->device;
68916 +       quirk->devid.subvendor = dev->subsystem_vendor;
68917 +       quirk->devid.subdevice = dev->subsystem_device;
68918 +       quirk->devid.class = 0;
68919 +       quirk->devid.class_mask = 0;
68920 +       quirk->devid.driver_data = 0UL;
68921 +
68922 +       quirk->pdev = dev;
68923 +
68924 +       register_quirk(quirk);
68925 +      out:
68926 +       return ret;
68927 +}
68928 +
68929 +void pciback_config_field_free(struct config_field *field)
68930 +{
68931 +       kfree(field);
68932 +}
68933 +
68934 +int pciback_config_quirk_release(struct pci_dev *dev)
68935 +{
68936 +       struct pciback_config_quirk *quirk;
68937 +       int ret = 0;
68938 +
68939 +       quirk = pciback_find_quirk(dev);
68940 +       if (!quirk) {
68941 +               ret = -ENXIO;
68942 +               goto out;
68943 +       }
68944 +
68945 +       list_del(&quirk->quirks_list);
68946 +       kfree(quirk);
68947 +
68948 +      out:
68949 +       return ret;
68950 +}
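[Editor's note] pciback_config_quirks_add_field() only wires up the default pass-through accessors for a field based on its size, so a quirk entry merely has to describe the register. A hypothetical kernel-style sketch (the 0x44 offset and helper name are invented; assumes the usual pciback includes):

	/* Hypothetical helper: expose a made-up vendor word at offset 0x44 as a
	 * plain read/write pass-through via the quirk machinery above. */
	static int example_add_vendor_word(struct pci_dev *dev)
	{
		struct config_field *field;
		int err;

		field = kzalloc(sizeof(*field), GFP_KERNEL);
		if (!field)
			return -ENOMEM;

		field->offset = 0x44;   /* made-up register */
		field->size   = 2;      /* word: the helper picks the u.w accessors */
		field->clean  = pciback_config_field_free;  /* freed as a dynamic field */

		err = pciback_config_quirks_add_field(dev, field);
		if (err)
			kfree(field);
		return err;
	}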
68951 diff -ruNp linux-2.6.19/drivers/xen/pciback/conf_space_quirks.h linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_quirks.h
68952 --- linux-2.6.19/drivers/xen/pciback/conf_space_quirks.h        1970-01-01 00:00:00.000000000 +0000
68953 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/conf_space_quirks.h      2007-02-02 19:10:45.000000000 +0000
68954 @@ -0,0 +1,35 @@
68955 +/*
68956 + * PCI Backend - Data structures for special overlays for broken devices.
68957 + *
68958 + * Ryan Wilson <hap9@epoch.ncsc.mil>
68959 + * Chris Bookholt <hap10@epoch.ncsc.mil>
68960 + */
68961 +
68962 +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
68963 +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
68964 +
68965 +#include <linux/pci.h>
68966 +#include <linux/list.h>
68967 +
68968 +struct pciback_config_quirk {
68969 +       struct list_head quirks_list;
68970 +       struct pci_device_id devid;
68971 +       struct pci_dev *pdev;
68972 +};
68973 +
68974 +struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
68975 +
68976 +int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
68977 +                                   *field);
68978 +
68979 +int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
68980 +
68981 +int pciback_config_quirks_init(struct pci_dev *dev);
68982 +
68983 +void pciback_config_field_free(struct config_field *field);
68984 +
68985 +int pciback_config_quirk_release(struct pci_dev *dev);
68986 +
68987 +int pciback_field_is_dup(struct pci_dev *dev, int reg);
68988 +
68989 +#endif
68990 diff -ruNp linux-2.6.19/drivers/xen/pciback/passthrough.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/passthrough.c
68991 --- linux-2.6.19/drivers/xen/pciback/passthrough.c      1970-01-01 00:00:00.000000000 +0000
68992 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/passthrough.c    2007-02-02 19:10:45.000000000 +0000
68993 @@ -0,0 +1,157 @@
68994 +/*
68995 + * PCI Backend - Provides restricted access to the real PCI bus topology
68996 + *               to the frontend
68997 + *
68998 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
68999 + */
69000 +
69001 +#include <linux/list.h>
69002 +#include <linux/pci.h>
69003 +#include <linux/spinlock.h>
69004 +#include "pciback.h"
69005 +
69006 +struct passthrough_dev_data {
69007 +       /* Access to dev_list must be protected by lock */
69008 +       struct list_head dev_list;
69009 +       spinlock_t lock;
69010 +};
69011 +
69012 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
69013 +                                   unsigned int domain, unsigned int bus,
69014 +                                   unsigned int devfn)
69015 +{
69016 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
69017 +       struct pci_dev_entry *dev_entry;
69018 +       struct pci_dev *dev = NULL;
69019 +       unsigned long flags;
69020 +
69021 +       spin_lock_irqsave(&dev_data->lock, flags);
69022 +
69023 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
69024 +               if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
69025 +                   && bus == (unsigned int)dev_entry->dev->bus->number
69026 +                   && devfn == dev_entry->dev->devfn) {
69027 +                       dev = dev_entry->dev;
69028 +                       break;
69029 +               }
69030 +       }
69031 +
69032 +       spin_unlock_irqrestore(&dev_data->lock, flags);
69033 +
69034 +       return dev;
69035 +}
69036 +
69037 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
69038 +{
69039 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
69040 +       struct pci_dev_entry *dev_entry;
69041 +       unsigned long flags;
69042 +
69043 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
69044 +       if (!dev_entry)
69045 +               return -ENOMEM;
69046 +       dev_entry->dev = dev;
69047 +
69048 +       spin_lock_irqsave(&dev_data->lock, flags);
69049 +       list_add_tail(&dev_entry->list, &dev_data->dev_list);
69050 +       spin_unlock_irqrestore(&dev_data->lock, flags);
69051 +
69052 +       return 0;
69053 +}
69054 +
69055 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
69056 +{
69057 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
69058 +       struct pci_dev_entry *dev_entry, *t;
69059 +       struct pci_dev *found_dev = NULL;
69060 +       unsigned long flags;
69061 +
69062 +       spin_lock_irqsave(&dev_data->lock, flags);
69063 +
69064 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
69065 +               if (dev_entry->dev == dev) {
69066 +                       list_del(&dev_entry->list);
69067 +                       found_dev = dev_entry->dev;
69068 +                       kfree(dev_entry);
69069 +               }
69070 +       }
69071 +
69072 +       spin_unlock_irqrestore(&dev_data->lock, flags);
69073 +
69074 +       if (found_dev)
69075 +               pcistub_put_pci_dev(found_dev);
69076 +}
69077 +
69078 +int pciback_init_devices(struct pciback_device *pdev)
69079 +{
69080 +       struct passthrough_dev_data *dev_data;
69081 +
69082 +       dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
69083 +       if (!dev_data)
69084 +               return -ENOMEM;
69085 +
69086 +       spin_lock_init(&dev_data->lock);
69087 +
69088 +       INIT_LIST_HEAD(&dev_data->dev_list);
69089 +
69090 +       pdev->pci_dev_data = dev_data;
69091 +
69092 +       return 0;
69093 +}
69094 +
69095 +int pciback_publish_pci_roots(struct pciback_device *pdev,
69096 +                             publish_pci_root_cb publish_root_cb)
69097 +{
69098 +       int err = 0;
69099 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
69100 +       struct pci_dev_entry *dev_entry, *e;
69101 +       struct pci_dev *dev;
69102 +       int found;
69103 +       unsigned int domain, bus;
69104 +
69105 +       spin_lock(&dev_data->lock);
69106 +
69107 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
69108 +               /* Only publish this device as a root if none of its
69109 +                * parent bridges are exported
69110 +                */
69111 +               found = 0;
69112 +               dev = dev_entry->dev->bus->self;
69113 +               for (; !found && dev != NULL; dev = dev->bus->self) {
69114 +                       list_for_each_entry(e, &dev_data->dev_list, list) {
69115 +                               if (dev == e->dev) {
69116 +                                       found = 1;
69117 +                                       break;
69118 +                               }
69119 +                       }
69120 +               }
69121 +
69122 +               domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
69123 +               bus = (unsigned int)dev_entry->dev->bus->number;
69124 +
69125 +               if (!found) {
69126 +                       err = publish_root_cb(pdev, domain, bus);
69127 +                       if (err)
69128 +                               break;
69129 +               }
69130 +       }
69131 +
69132 +       spin_unlock(&dev_data->lock);
69133 +
69134 +       return err;
69135 +}
69136 +
69137 +void pciback_release_devices(struct pciback_device *pdev)
69138 +{
69139 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
69140 +       struct pci_dev_entry *dev_entry, *t;
69141 +
69142 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
69143 +               list_del(&dev_entry->list);
69144 +               pcistub_put_pci_dev(dev_entry->dev);
69145 +               kfree(dev_entry);
69146 +       }
69147 +
69148 +       kfree(dev_data);
69149 +       pdev->pci_dev_data = NULL;
69150 +}
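[Editor's note] The lookups above match on (domain, bus, devfn); the devfn packing is the standard kernel encoding provided by PCI_DEVFN/PCI_SLOT/PCI_FUNC in <linux/pci.h>. The local macros below only mirror that encoding for a quick user-space illustration (slot/function values invented).

	/* devfn packing as used by pciback_get_pci_dev(), illustration only. */
	#include <stdio.h>

	#define DEVFN(slot, func)  ((((slot) & 0x1f) << 3) | ((func) & 0x07))
	#define SLOT(devfn)        (((devfn) >> 3) & 0x1f)
	#define FUNC(devfn)        ((devfn) & 0x07)

	int main(void)
	{
		unsigned int devfn = DEVFN(3, 1);   /* 00:03.1 -> 0x19 */
		printf("devfn=0x%02x slot=%u func=%u\n",
		       devfn, SLOT(devfn), FUNC(devfn));
		return 0;
	}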
69151 diff -ruNp linux-2.6.19/drivers/xen/pciback/pci_stub.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/pci_stub.c
69152 --- linux-2.6.19/drivers/xen/pciback/pci_stub.c 1970-01-01 00:00:00.000000000 +0000
69153 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/pci_stub.c       2007-02-02 19:10:45.000000000 +0000
69154 @@ -0,0 +1,916 @@
69155 +/*
69156 + * PCI Stub Driver - Grabs devices in backend to be exported later
69157 + *
69158 + * Ryan Wilson <hap9@epoch.ncsc.mil>
69159 + * Chris Bookholt <hap10@epoch.ncsc.mil>
69160 + */
69161 +#include <linux/module.h>
69162 +#include <linux/init.h>
69163 +#include <linux/list.h>
69164 +#include <linux/spinlock.h>
69165 +#include <linux/kref.h>
69166 +#include <asm/atomic.h>
69167 +#include "pciback.h"
69168 +#include "conf_space.h"
69169 +#include "conf_space_quirks.h"
69170 +
69171 +static char *pci_devs_to_hide = NULL;
69172 +module_param_named(hide, pci_devs_to_hide, charp, 0444);
69173 +
69174 +struct pcistub_device_id {
69175 +       struct list_head slot_list;
69176 +       int domain;
69177 +       unsigned char bus;
69178 +       unsigned int devfn;
69179 +};
69180 +static LIST_HEAD(pcistub_device_ids);
69181 +static DEFINE_SPINLOCK(device_ids_lock);
69182 +
69183 +struct pcistub_device {
69184 +       struct kref kref;
69185 +       struct list_head dev_list;
69186 +       spinlock_t lock;
69187 +
69188 +       struct pci_dev *dev;
69189 +       struct pciback_device *pdev;    /* non-NULL if struct pci_dev is in use */
69190 +};
69191 +
69192 +/* Access to pcistub_devices & seized_devices lists and the initialize_devices
69193 + * flag must be locked with pcistub_devices_lock
69194 + */
69195 +static DEFINE_SPINLOCK(pcistub_devices_lock);
69196 +static LIST_HEAD(pcistub_devices);
69197 +
69198 +/* wait for device_initcall before initializing our devices
69199 + * (see pcistub_init_devices_late)
69200 + */
69201 +static int initialize_devices = 0;
69202 +static LIST_HEAD(seized_devices);
69203 +
69204 +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
69205 +{
69206 +       struct pcistub_device *psdev;
69207 +
69208 +       dev_dbg(&dev->dev, "pcistub_device_alloc\n");
69209 +
69210 +       psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
69211 +       if (!psdev)
69212 +               return NULL;
69213 +
69214 +       psdev->dev = pci_dev_get(dev);
69215 +       if (!psdev->dev) {
69216 +               kfree(psdev);
69217 +               return NULL;
69218 +       }
69219 +
69220 +       kref_init(&psdev->kref);
69221 +       spin_lock_init(&psdev->lock);
69222 +
69223 +       return psdev;
69224 +}
69225 +
69226 +/* Don't call this directly as it's called by pcistub_device_put */
69227 +static void pcistub_device_release(struct kref *kref)
69228 +{
69229 +       struct pcistub_device *psdev;
69230 +
69231 +       psdev = container_of(kref, struct pcistub_device, kref);
69232 +
69233 +       dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
69234 +
69235 +       /* Clean-up the device */
69236 +       pciback_reset_device(psdev->dev);
69237 +       pciback_config_free_dyn_fields(psdev->dev);
69238 +       pciback_config_free_dev(psdev->dev);
69239 +       kfree(pci_get_drvdata(psdev->dev));
69240 +       pci_set_drvdata(psdev->dev, NULL);
69241 +
69242 +       pci_dev_put(psdev->dev);
69243 +
69244 +       kfree(psdev);
69245 +}
69246 +
69247 +static inline void pcistub_device_get(struct pcistub_device *psdev)
69248 +{
69249 +       kref_get(&psdev->kref);
69250 +}
69251 +
69252 +static inline void pcistub_device_put(struct pcistub_device *psdev)
69253 +{
69254 +       kref_put(&psdev->kref, pcistub_device_release);
69255 +}
69256 +
69257 +static struct pcistub_device *pcistub_device_find(int domain, int bus,
69258 +                                                 int slot, int func)
69259 +{
69260 +       struct pcistub_device *psdev = NULL;
69261 +       unsigned long flags;
69262 +
69263 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69264 +
69265 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
69266 +               if (psdev->dev != NULL
69267 +                   && domain == pci_domain_nr(psdev->dev->bus)
69268 +                   && bus == psdev->dev->bus->number
69269 +                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
69270 +                       pcistub_device_get(psdev);
69271 +                       goto out;
69272 +               }
69273 +       }
69274 +
69275 +       /* didn't find it */
69276 +       psdev = NULL;
69277 +
69278 +      out:
69279 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69280 +       return psdev;
69281 +}
69282 +
69283 +static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
69284 +                                                 struct pcistub_device *psdev)
69285 +{
69286 +       struct pci_dev *pci_dev = NULL;
69287 +       unsigned long flags;
69288 +
69289 +       pcistub_device_get(psdev);
69290 +
69291 +       spin_lock_irqsave(&psdev->lock, flags);
69292 +       if (!psdev->pdev) {
69293 +               psdev->pdev = pdev;
69294 +               pci_dev = psdev->dev;
69295 +       }
69296 +       spin_unlock_irqrestore(&psdev->lock, flags);
69297 +
69298 +       if (!pci_dev)
69299 +               pcistub_device_put(psdev);
69300 +
69301 +       return pci_dev;
69302 +}
69303 +
69304 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
69305 +                                           int domain, int bus,
69306 +                                           int slot, int func)
69307 +{
69308 +       struct pcistub_device *psdev;
69309 +       struct pci_dev *found_dev = NULL;
69310 +       unsigned long flags;
69311 +
69312 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69313 +
69314 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
69315 +               if (psdev->dev != NULL
69316 +                   && domain == pci_domain_nr(psdev->dev->bus)
69317 +                   && bus == psdev->dev->bus->number
69318 +                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
69319 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
69320 +                       break;
69321 +               }
69322 +       }
69323 +
69324 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69325 +       return found_dev;
69326 +}
69327 +
69328 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
69329 +                                   struct pci_dev *dev)
69330 +{
69331 +       struct pcistub_device *psdev;
69332 +       struct pci_dev *found_dev = NULL;
69333 +       unsigned long flags;
69334 +
69335 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69336 +
69337 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
69338 +               if (psdev->dev == dev) {
69339 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
69340 +                       break;
69341 +               }
69342 +       }
69343 +
69344 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69345 +       return found_dev;
69346 +}
69347 +
69348 +void pcistub_put_pci_dev(struct pci_dev *dev)
69349 +{
69350 +       struct pcistub_device *psdev, *found_psdev = NULL;
69351 +       unsigned long flags;
69352 +
69353 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69354 +
69355 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
69356 +               if (psdev->dev == dev) {
69357 +                       found_psdev = psdev;
69358 +                       break;
69359 +               }
69360 +       }
69361 +
69362 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69363 +
69364 +       /* Cleanup our device
69365 +        * (so it's ready for the next domain)
69366 +        */
69367 +       pciback_reset_device(found_psdev->dev);
69368 +       pciback_config_free_dyn_fields(found_psdev->dev);
69369 +       pciback_config_reset_dev(found_psdev->dev);
69370 +
69371 +       spin_lock_irqsave(&found_psdev->lock, flags);
69372 +       found_psdev->pdev = NULL;
69373 +       spin_unlock_irqrestore(&found_psdev->lock, flags);
69374 +
69375 +       pcistub_device_put(found_psdev);
69376 +}
69377 +
69378 +static int __devinit pcistub_match_one(struct pci_dev *dev,
69379 +                                      struct pcistub_device_id *pdev_id)
69380 +{
69381 +       /* Match the specified device by domain, bus, slot and func, or if
69382 +        * any of the device's parent bridges match.
69383 +        */
69384 +       for (; dev != NULL; dev = dev->bus->self) {
69385 +               if (pci_domain_nr(dev->bus) == pdev_id->domain
69386 +                   && dev->bus->number == pdev_id->bus
69387 +                   && dev->devfn == pdev_id->devfn)
69388 +                       return 1;
69389 +
69390 +               /* Sometimes topmost bridge links to itself. */
69391 +               if (dev == dev->bus->self)
69392 +                       break;
69393 +       }
69394 +
69395 +       return 0;
69396 +}
69397 +
69398 +static int __devinit pcistub_match(struct pci_dev *dev)
69399 +{
69400 +       struct pcistub_device_id *pdev_id;
69401 +       unsigned long flags;
69402 +       int found = 0;
69403 +
69404 +       spin_lock_irqsave(&device_ids_lock, flags);
69405 +       list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
69406 +               if (pcistub_match_one(dev, pdev_id)) {
69407 +                       found = 1;
69408 +                       break;
69409 +               }
69410 +       }
69411 +       spin_unlock_irqrestore(&device_ids_lock, flags);
69412 +
69413 +       return found;
69414 +}
69415 +
69416 +static int __devinit pcistub_init_device(struct pci_dev *dev)
69417 +{
69418 +       struct pciback_dev_data *dev_data;
69419 +       int err = 0;
69420 +
69421 +       dev_dbg(&dev->dev, "initializing...\n");
69422 +
69423 +       /* The PCI backend is not intended to be a module (or to work with
69424 +        * removable PCI devices yet). If it were, pciback_config_free()
69425 +        * would need to be called somewhere to free the memory allocated
69426 +        * here and then to call kfree(pci_get_drvdata(psdev->dev)).
69427 +        */
69428 +       dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
69429 +       if (!dev_data) {
69430 +               err = -ENOMEM;
69431 +               goto out;
69432 +       }
69433 +       pci_set_drvdata(dev, dev_data);
69434 +
69435 +       dev_dbg(&dev->dev, "initializing config\n");
69436 +       err = pciback_config_init_dev(dev);
69437 +       if (err)
69438 +               goto out;
69439 +
69440 +       /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
69441 +        * must do this here because pcibios_enable_device may specify
69442 +        * the pci device's true irq (and possibly its other resources)
69443 +        * if they differ from what's in the configuration space.
69444 +        * This makes the assumption that the device's resources won't
69445 +        * change after this point (otherwise this code may break!)
69446 +        */
69447 +       dev_dbg(&dev->dev, "enabling device\n");
69448 +       err = pci_enable_device(dev);
69449 +       if (err)
69450 +               goto config_release;
69451 +
69452 +       /* Now disable the device (this also ensures some private device
69453 +        * data is setup before we export)
69454 +        * data is set up before we export)
69455 +       dev_dbg(&dev->dev, "reset device\n");
69456 +       pciback_reset_device(dev);
69457 +
69458 +       return 0;
69459 +
69460 +      config_release:
69461 +       pciback_config_free_dev(dev);
69462 +
69463 +      out:
69464 +       pci_set_drvdata(dev, NULL);
69465 +       kfree(dev_data);
69466 +       return err;
69467 +}
69468 +
69469 +/*
69470 + * Because some initialization still happens on
69471 + * devices during fs_initcall, we need to defer
69472 + * full initialization of our devices until
69473 + * device_initcall.
69474 + */
69475 +static int __init pcistub_init_devices_late(void)
69476 +{
69477 +       struct pcistub_device *psdev;
69478 +       unsigned long flags;
69479 +       int err = 0;
69480 +
69481 +       pr_debug("pciback: pcistub_init_devices_late\n");
69482 +
69483 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69484 +
69485 +       while (!list_empty(&seized_devices)) {
69486 +               psdev = container_of(seized_devices.next,
69487 +                                    struct pcistub_device, dev_list);
69488 +               list_del(&psdev->dev_list);
69489 +
69490 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69491 +
69492 +               err = pcistub_init_device(psdev->dev);
69493 +               if (err) {
69494 +                       dev_err(&psdev->dev->dev,
69495 +                               "error %d initializing device\n", err);
69496 +                       kfree(psdev);
69497 +                       psdev = NULL;
69498 +               }
69499 +
69500 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
69501 +
69502 +               if (psdev)
69503 +                       list_add_tail(&psdev->dev_list, &pcistub_devices);
69504 +       }
69505 +
69506 +       initialize_devices = 1;
69507 +
69508 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69509 +
69510 +       return 0;
69511 +}
69512 +
69513 +static int __devinit pcistub_seize(struct pci_dev *dev)
69514 +{
69515 +       struct pcistub_device *psdev;
69516 +       unsigned long flags;
69517 +       int err = 0;
69518 +
69519 +       psdev = pcistub_device_alloc(dev);
69520 +       if (!psdev)
69521 +               return -ENOMEM;
69522 +
69523 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69524 +
69525 +       if (initialize_devices) {
69526 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69527 +
69528 +               /* don't want irqs disabled when calling pcistub_init_device */
69529 +               err = pcistub_init_device(psdev->dev);
69530 +
69531 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
69532 +
69533 +               if (!err)
69534 +                       list_add(&psdev->dev_list, &pcistub_devices);
69535 +       } else {
69536 +               dev_dbg(&dev->dev, "deferring initialization\n");
69537 +               list_add(&psdev->dev_list, &seized_devices);
69538 +       }
69539 +
69540 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69541 +
69542 +       if (err)
69543 +               pcistub_device_put(psdev);
69544 +
69545 +       return err;
69546 +}
69547 +
69548 +static int __devinit pcistub_probe(struct pci_dev *dev,
69549 +                                  const struct pci_device_id *id)
69550 +{
69551 +       int err = 0;
69552 +
69553 +       dev_dbg(&dev->dev, "probing...\n");
69554 +
69555 +       if (pcistub_match(dev)) {
69556 +
69557 +               if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
69558 +                   && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
69559 +                       dev_err(&dev->dev, "can't export pci devices that "
69560 +                               "don't have a normal (0) or bridge (1) "
69561 +                               "header type!\n");
69562 +                       err = -ENODEV;
69563 +                       goto out;
69564 +               }
69565 +
69566 +               dev_info(&dev->dev, "seizing device\n");
69567 +               err = pcistub_seize(dev);
69568 +       } else
69569 +               /* Didn't find the device */
69570 +               err = -ENODEV;
69571 +
69572 +      out:
69573 +       return err;
69574 +}
69575 +
69576 +static void pcistub_remove(struct pci_dev *dev)
69577 +{
69578 +       struct pcistub_device *psdev, *found_psdev = NULL;
69579 +       unsigned long flags;
69580 +
69581 +       dev_dbg(&dev->dev, "removing\n");
69582 +
69583 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69584 +
69585 +       pciback_config_quirk_release(dev);
69586 +
69587 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
69588 +               if (psdev->dev == dev) {
69589 +                       found_psdev = psdev;
69590 +                       break;
69591 +               }
69592 +       }
69593 +
69594 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69595 +
69596 +       if (found_psdev) {
69597 +               dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
69598 +                       found_psdev->pdev);
69599 +
69600 +               if (found_psdev->pdev) {
69601 +                       printk(KERN_WARNING "pciback: ****** removing device "
69602 +                              "%s while still in-use! ******\n",
69603 +                              pci_name(found_psdev->dev));
69604 +                       printk(KERN_WARNING "pciback: ****** driver domain may "
69605 +                              "still access this device's i/o resources!\n");
69606 +                       printk(KERN_WARNING "pciback: ****** shut down driver "
69607 +                              "domain before binding device\n");
69608 +                       printk(KERN_WARNING "pciback: ****** to other drivers "
69609 +                              "or domains\n");
69610 +
69611 +                       pciback_release_pci_dev(found_psdev->pdev,
69612 +                                               found_psdev->dev);
69613 +               }
69614 +
69615 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
69616 +               list_del(&found_psdev->dev_list);
69617 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69618 +
69619 +               /* the final put for releasing from the list */
69620 +               pcistub_device_put(found_psdev);
69621 +       }
69622 +}
69623 +
69624 +static struct pci_device_id pcistub_ids[] = {
69625 +       {
69626 +        .vendor = PCI_ANY_ID,
69627 +        .device = PCI_ANY_ID,
69628 +        .subvendor = PCI_ANY_ID,
69629 +        .subdevice = PCI_ANY_ID,
69630 +        },
69631 +       {0,},
69632 +};
69633 +
69634 +/*
69635 + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
69636 + * for a normal device. I don't want it to be loaded automatically.
69637 + */
69638 +
69639 +static struct pci_driver pciback_pci_driver = {
69640 +       .name = "pciback",
69641 +       .id_table = pcistub_ids,
69642 +       .probe = pcistub_probe,
69643 +       .remove = pcistub_remove,
69644 +};
69645 +
69646 +static inline int str_to_slot(const char *buf, int *domain, int *bus,
69647 +                             int *slot, int *func)
69648 +{
69649 +       int err;
69650 +
69651 +       err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
69652 +       if (err == 4)
69653 +               return 0;
69654 +       else if (err < 0)
69655 +               return -EINVAL;
69656 +
69657 +       /* try again without domain */
69658 +       *domain = 0;
69659 +       err = sscanf(buf, " %x:%x.%x", bus, slot, func);
69660 +       if (err == 3)
69661 +               return 0;
69662 +
69663 +       return -EINVAL;
69664 +}
69665 +
69666 +static inline int str_to_quirk(const char *buf, int *domain, int *bus,
69667 +                              int *slot, int *func, int *reg, int *size, int *mask)
69668 +{
69669 +       int err;
69670 +
69671 +       err =
69672 +           sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
69673 +                  func, reg, size, mask);
69674 +       if (err == 7)
69675 +               return 0;
69676 +       return -EINVAL;
69677 +}
69678 +
69679 +static int pcistub_device_id_add(int domain, int bus, int slot, int func)
69680 +{
69681 +       struct pcistub_device_id *pci_dev_id;
69682 +       unsigned long flags;
69683 +
69684 +       pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
69685 +       if (!pci_dev_id)
69686 +               return -ENOMEM;
69687 +
69688 +       pci_dev_id->domain = domain;
69689 +       pci_dev_id->bus = bus;
69690 +       pci_dev_id->devfn = PCI_DEVFN(slot, func);
69691 +
69692 +       pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
69693 +                domain, bus, slot, func);
69694 +
69695 +       spin_lock_irqsave(&device_ids_lock, flags);
69696 +       list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
69697 +       spin_unlock_irqrestore(&device_ids_lock, flags);
69698 +
69699 +       return 0;
69700 +}
69701 +
69702 +static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
69703 +{
69704 +       struct pcistub_device_id *pci_dev_id, *t;
69705 +       int devfn = PCI_DEVFN(slot, func);
69706 +       int err = -ENOENT;
69707 +       unsigned long flags;
69708 +
69709 +       spin_lock_irqsave(&device_ids_lock, flags);
69710 +       list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
69711 +
69712 +               if (pci_dev_id->domain == domain
69713 +                   && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
69714 +                       /* Don't break out of the loop here, because the same
69715 +                        * slot could be in the list more than once
69716 +                        */
69717 +                       list_del(&pci_dev_id->slot_list);
69718 +                       kfree(pci_dev_id);
69719 +
69720 +                       err = 0;
69721 +
69722 +                       pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
69723 +                                "seize list\n", domain, bus, slot, func);
69724 +               }
69725 +       }
69726 +       spin_unlock_irqrestore(&device_ids_lock, flags);
69727 +
69728 +       return err;
69729 +}
69730 +
69731 +static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
69732 +                          int size, int mask)
69733 +{
69734 +       int err = 0;
69735 +       struct pcistub_device *psdev;
69736 +       struct pci_dev *dev;
69737 +       struct config_field *field;
69738 +
69739 +       psdev = pcistub_device_find(domain, bus, slot, func);
69740 +       if (!psdev || !psdev->dev) {
69741 +               err = -ENODEV;
69742 +               goto out;
69743 +       }
69744 +       dev = psdev->dev;
69745 +
69746 +       /* check for duplicate field */
69747 +       if (pciback_field_is_dup(dev, reg))
69748 +               goto out;
69749 +
69750 +       field = kzalloc(sizeof(*field), GFP_ATOMIC);
69751 +       if (!field) {
69752 +               err = -ENOMEM;
69753 +               goto out;
69754 +       }
69755 +
69756 +       field->offset = reg;
69757 +       field->size = size;
69758 +       field->mask = mask;
69759 +       field->init = NULL;
69760 +       field->reset = NULL;
69761 +       field->release = NULL;
69762 +       field->clean = pciback_config_field_free;
69763 +
69764 +       err = pciback_config_quirks_add_field(dev, field);
69765 +       if (err)
69766 +               kfree(field);
69767 +      out:
69768 +       return err;
69769 +}
69770 +
69771 +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
69772 +                               size_t count)
69773 +{
69774 +       int domain, bus, slot, func;
69775 +       int err;
69776 +
69777 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
69778 +       if (err)
69779 +               goto out;
69780 +
69781 +       err = pcistub_device_id_add(domain, bus, slot, func);
69782 +
69783 +      out:
69784 +       if (!err)
69785 +               err = count;
69786 +       return err;
69787 +}
69788 +
69789 +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
69790 +
69791 +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
69792 +                                  size_t count)
69793 +{
69794 +       int domain, bus, slot, func;
69795 +       int err;
69796 +
69797 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
69798 +       if (err)
69799 +               goto out;
69800 +
69801 +       err = pcistub_device_id_remove(domain, bus, slot, func);
69802 +
69803 +      out:
69804 +       if (!err)
69805 +               err = count;
69806 +       return err;
69807 +}
69808 +
69809 +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
69810 +
69811 +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
69812 +{
69813 +       struct pcistub_device_id *pci_dev_id;
69814 +       size_t count = 0;
69815 +       unsigned long flags;
69816 +
69817 +       spin_lock_irqsave(&device_ids_lock, flags);
69818 +       list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
69819 +               if (count >= PAGE_SIZE)
69820 +                       break;
69821 +
69822 +               count += scnprintf(buf + count, PAGE_SIZE - count,
69823 +                                  "%04x:%02x:%02x.%01x\n",
69824 +                                  pci_dev_id->domain, pci_dev_id->bus,
69825 +                                  PCI_SLOT(pci_dev_id->devfn),
69826 +                                  PCI_FUNC(pci_dev_id->devfn));
69827 +       }
69828 +       spin_unlock_irqrestore(&device_ids_lock, flags);
69829 +
69830 +       return count;
69831 +}
69832 +
69833 +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
69834 +
69835 +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
69836 +                                size_t count)
69837 +{
69838 +       int domain, bus, slot, func, reg, size, mask;
69839 +       int err;
69840 +
69841 +       err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
69842 +                          &mask);
69843 +       if (err)
69844 +               goto out;
69845 +
69846 +       err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
69847 +
69848 +      out:
69849 +       if (!err)
69850 +               err = count;
69851 +       return err;
69852 +}
69853 +
69854 +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
69855 +{
69856 +       int count = 0;
69857 +       unsigned long flags;
69858 +       extern struct list_head pciback_quirks;
69859 +       struct pciback_config_quirk *quirk;
69860 +       struct pciback_dev_data *dev_data;
69861 +       struct config_field *field;
69862 +       struct config_field_entry *cfg_entry;
69863 +
69864 +       spin_lock_irqsave(&device_ids_lock, flags);
69865 +       list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
69866 +               if (count >= PAGE_SIZE)
69867 +                       goto out;
69868 +
69869 +               count += scnprintf(buf + count, PAGE_SIZE - count,
69870 +                                  "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
69871 +                                  quirk->pdev->bus->number,
69872 +                                  PCI_SLOT(quirk->pdev->devfn),
69873 +                                  PCI_FUNC(quirk->pdev->devfn),
69874 +                                  quirk->devid.vendor, quirk->devid.device,
69875 +                                  quirk->devid.subvendor,
69876 +                                  quirk->devid.subdevice);
69877 +
69878 +               dev_data = pci_get_drvdata(quirk->pdev);
69879 +
69880 +               list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
69881 +                       field = cfg_entry->field;
69882 +                       if (count >= PAGE_SIZE)
69883 +                               goto out;
69884 +
69885 +                       count += scnprintf(buf + count, PAGE_SIZE -
69886 +                                          count, "\t\t%08x:%01x:%08x\n",
69887 +                                          field->offset, field->size,
69888 +                                          field->mask);
69889 +               }
69890 +       }
69891 +
69892 +      out:
69893 +       spin_unlock_irqrestore(&device_ids_lock, flags);
69894 +
69895 +       return count;
69896 +}
69897 +
69898 +DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
69899 +
69900 +static ssize_t permissive_add(struct device_driver *drv, const char *buf,
69901 +                             size_t count)
69902 +{
69903 +       int domain, bus, slot, func;
69904 +       int err;
69905 +       struct pcistub_device *psdev;
69906 +       struct pciback_dev_data *dev_data;
69907 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
69908 +       if (err)
69909 +               goto out;
69910 +       psdev = pcistub_device_find(domain, bus, slot, func);
69911 +       if (!psdev) {
69912 +               err = -ENODEV;
69913 +               goto out;
69914 +       }
69915 +       if (!psdev->dev) {
69916 +               err = -ENODEV;
69917 +               goto release;
69918 +       }
69919 +       dev_data = pci_get_drvdata(psdev->dev);
69920 +       /* the driver data for a device should never be null at this point */
69921 +       if (!dev_data) {
69922 +               err = -ENXIO;
69923 +               goto release;
69924 +       }
69925 +       if (!dev_data->permissive) {
69926 +               dev_data->permissive = 1;
69927 +               /* Let user know that what they're doing could be unsafe */
69928 +               dev_warn(&psdev->dev->dev,
69929 +                        "enabling permissive mode configuration space accesses!\n");
69930 +               dev_warn(&psdev->dev->dev,
69931 +                        "permissive mode is potentially unsafe!\n");
69932 +       }
69933 +      release:
69934 +       pcistub_device_put(psdev);
69935 +      out:
69936 +       if (!err)
69937 +               err = count;
69938 +       return err;
69939 +}
69940 +
69941 +static ssize_t permissive_show(struct device_driver *drv, char *buf)
69942 +{
69943 +       struct pcistub_device *psdev;
69944 +       struct pciback_dev_data *dev_data;
69945 +       size_t count = 0;
69946 +       unsigned long flags;
69947 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
69948 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
69949 +               if (count >= PAGE_SIZE)
69950 +                       break;
69951 +               if (!psdev->dev)
69952 +                       continue;
69953 +               dev_data = pci_get_drvdata(psdev->dev);
69954 +               if (!dev_data || !dev_data->permissive)
69955 +                       continue;
69956 +               count +=
69957 +                   scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
69958 +                             pci_name(psdev->dev));
69959 +       }
69960 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
69961 +       return count;
69962 +}
69963 +
69964 +DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
69965 +
69966 +static int __init pcistub_init(void)
69967 +{
69968 +       int pos = 0;
69969 +       int err = 0;
69970 +       int domain, bus, slot, func;
69971 +       int parsed;
69972 +
69973 +       if (pci_devs_to_hide && *pci_devs_to_hide) {
69974 +               do {
69975 +                       parsed = 0;
69976 +
69977 +                       err = sscanf(pci_devs_to_hide + pos,
69978 +                                    " (%x:%x:%x.%x) %n",
69979 +                                    &domain, &bus, &slot, &func, &parsed);
69980 +                       if (err != 4) {
69981 +                               domain = 0;
69982 +                               err = sscanf(pci_devs_to_hide + pos,
69983 +                                            " (%x:%x.%x) %n",
69984 +                                            &bus, &slot, &func, &parsed);
69985 +                               if (err != 3)
69986 +                                       goto parse_error;
69987 +                       }
69988 +
69989 +                       err = pcistub_device_id_add(domain, bus, slot, func);
69990 +                       if (err)
69991 +                               goto out;
69992 +
69993 +                       /* if parsed<=0, we've reached the end of the string */
69994 +                       pos += parsed;
69995 +               } while (parsed > 0 && pci_devs_to_hide[pos]);
69996 +       }
69997 +
69998 +       /* If we're the first PCI Device Driver to register, we're the
69999 +        * first one to get offered PCI devices as they become
70000 +        * available (and thus we can be the first to grab them)
70001 +        */
70002 +       err = pci_register_driver(&pciback_pci_driver);
70003 +       if (err < 0)
70004 +               goto out;
70005 +
70006 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
70007 +       driver_create_file(&pciback_pci_driver.driver,
70008 +                          &driver_attr_remove_slot);
70009 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_slots);
70010 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_quirks);
70011 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_permissive);
70012 +
70013 +      out:
70014 +       return err;
70015 +
70016 +      parse_error:
70017 +       printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
70018 +              pci_devs_to_hide + pos);
70019 +       return -EINVAL;
70020 +}
70021 +
70022 +#ifndef MODULE
70023 +/*
70024 + * fs_initcall happens before device_initcall
70025 + * so pciback *should* get called first (because we
70026 + * want to grab any device before other drivers
70027 + * get a chance, by being the first PCI device
70028 + * driver to register)
70029 + */
70030 +fs_initcall(pcistub_init);
70031 +#endif
70032 +
70033 +static int __init pciback_init(void)
70034 +{
70035 +       int err;
70036 +
70037 +       err = pciback_config_init();
70038 +       if (err)
70039 +               return err;
70040 +
70041 +#ifdef MODULE
70042 +       err = pcistub_init();
70043 +       if (err < 0)
70044 +               return err;
70045 +#endif
70046 +
70047 +       pcistub_init_devices_late();
70048 +       pciback_xenbus_register();
70049 +
70050 +       return 0;
70051 +}
70052 +
70053 +static void __exit pciback_cleanup(void)
70054 +{
70055 +       pciback_xenbus_unregister();
70056 +
70057 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
70058 +       driver_remove_file(&pciback_pci_driver.driver,
70059 +                          &driver_attr_remove_slot);
70060 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
70061 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
70062 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
70063 +
70064 +       pci_unregister_driver(&pciback_pci_driver);
70065 +}
70066 +
70067 +module_init(pciback_init);
70068 +module_exit(pciback_cleanup);
70069 +
70070 +MODULE_LICENSE("Dual BSD/GPL");
70071 diff -ruNp linux-2.6.19/drivers/xen/pciback/pciback.h linux-2.6.19-xen-3.0.4/drivers/xen/pciback/pciback.h
70072 --- linux-2.6.19/drivers/xen/pciback/pciback.h  1970-01-01 00:00:00.000000000 +0000
70073 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/pciback.h        2007-02-02 19:10:45.000000000 +0000
70074 @@ -0,0 +1,93 @@
70075 +/*
70076 + * PCI Backend Common Data Structures & Function Declarations
70077 + *
70078 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
70079 + */
70080 +#ifndef __XEN_PCIBACK_H__
70081 +#define __XEN_PCIBACK_H__
70082 +
70083 +#include <linux/pci.h>
70084 +#include <linux/interrupt.h>
70085 +#include <xen/xenbus.h>
70086 +#include <linux/list.h>
70087 +#include <linux/spinlock.h>
70088 +#include <linux/workqueue.h>
70089 +#include <asm/atomic.h>
70090 +#include <xen/interface/io/pciif.h>
70091 +
70092 +struct pci_dev_entry {
70093 +       struct list_head list;
70094 +       struct pci_dev *dev;
70095 +};
70096 +
70097 +#define _PDEVF_op_active       (0)
70098 +#define PDEVF_op_active        (1<<(_PDEVF_op_active))
70099 +
70100 +struct pciback_device {
70101 +       void *pci_dev_data;
70102 +       spinlock_t dev_lock;
70103 +
70104 +       struct xenbus_device *xdev;
70105 +
70106 +       struct xenbus_watch be_watch;
70107 +       u8 be_watching;
70108 +
70109 +       int evtchn_irq;
70110 +
70111 +       struct vm_struct *sh_area;
70112 +       struct xen_pci_sharedinfo *sh_info;
70113 +
70114 +       unsigned long flags;
70115 +
70116 +       struct work_struct op_work;
70117 +};
70118 +
70119 +struct pciback_dev_data {
70120 +       struct list_head config_fields;
70121 +       int permissive;
70122 +       int warned_on_write;
70123 +};
70124 +
70125 +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
70126 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
70127 +                                           int domain, int bus,
70128 +                                           int slot, int func);
70129 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
70130 +                                   struct pci_dev *dev);
70131 +void pcistub_put_pci_dev(struct pci_dev *dev);
70132 +
70133 +/* Ensure a device is turned off or reset */
70134 +void pciback_reset_device(struct pci_dev *pdev);
70135 +
70136 +/* Access a virtual configuration space for a PCI device */
70137 +int pciback_config_init(void);
70138 +int pciback_config_init_dev(struct pci_dev *dev);
70139 +void pciback_config_free_dyn_fields(struct pci_dev *dev);
70140 +void pciback_config_reset_dev(struct pci_dev *dev);
70141 +void pciback_config_free_dev(struct pci_dev *dev);
70142 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
70143 +                       u32 * ret_val);
70144 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
70145 +
70146 +/* Handle requests for specific devices from the frontend */
70147 +typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
70148 +                                   unsigned int domain, unsigned int bus);
70149 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
70150 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
70151 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
70152 +                                   unsigned int domain, unsigned int bus,
70153 +                                   unsigned int devfn);
70154 +int pciback_init_devices(struct pciback_device *pdev);
70155 +int pciback_publish_pci_roots(struct pciback_device *pdev,
70156 +                             publish_pci_root_cb cb);
70157 +void pciback_release_devices(struct pciback_device *pdev);
70158 +
70159 +/* Handles events from front-end */
70160 +irqreturn_t pciback_handle_event(int irq, void *dev_id);
70161 +void pciback_do_op(void *data);
70162 +
70163 +int pciback_xenbus_register(void);
70164 +void pciback_xenbus_unregister(void);
70165 +
70166 +extern int verbose_request;
70167 +#endif
70168 diff -ruNp linux-2.6.19/drivers/xen/pciback/pciback_ops.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/pciback_ops.c
70169 --- linux-2.6.19/drivers/xen/pciback/pciback_ops.c      1970-01-01 00:00:00.000000000 +0000
70170 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/pciback_ops.c    2007-02-02 19:10:45.000000000 +0000
70171 @@ -0,0 +1,95 @@
70172 +/*
70173 + * PCI Backend Operations - respond to PCI requests from Frontend
70174 + *
70175 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
70176 + */
70177 +#include <linux/module.h>
70178 +#include <asm/bitops.h>
70179 +#include <xen/evtchn.h>
70180 +#include "pciback.h"
70181 +
70182 +int verbose_request = 0;
70183 +module_param(verbose_request, int, 0644);
70184 +
70185 +/* Ensure a device is "turned off" and ready to be exported.
70186 + * (Also see pciback_config_reset_dev to ensure virtual configuration space is
70187 + * ready to be re-exported)
70188 + */
70189 +void pciback_reset_device(struct pci_dev *dev)
70190 +{
70191 +       u16 cmd;
70192 +
70193 +       /* Disable devices (but not bridges) */
70194 +       if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
70195 +               pci_disable_device(dev);
70196 +
70197 +               pci_write_config_word(dev, PCI_COMMAND, 0);
70198 +
70199 +               dev->is_enabled = 0;
70200 +               dev->is_busmaster = 0;
70201 +       } else {
70202 +               pci_read_config_word(dev, PCI_COMMAND, &cmd);
70203 +               if (cmd & (PCI_COMMAND_INVALIDATE)) {
70204 +                       cmd &= ~(PCI_COMMAND_INVALIDATE);
70205 +                       pci_write_config_word(dev, PCI_COMMAND, cmd);
70206 +
70207 +                       dev->is_busmaster = 0;
70208 +               }
70209 +       }
70210 +}
70211 +
70212 +static inline void test_and_schedule_op(struct pciback_device *pdev)
70213 +{
70214 +       /* Check that frontend is requesting an operation and that we are not
70215 +        * already processing a request */
70216 +       if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
70217 +           && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
70218 +               schedule_work(&pdev->op_work);
70219 +}
70220 +
70221 +/* Configuration space reads/writes must not be performed in atomic context
70222 + * because some of the pci_* functions can sleep (mostly due to ACPI's use of
70223 + * semaphores). This function is intended to be called from a work queue in
70224 + * process context, taking a struct pciback_device as its parameter. */
70225 +void pciback_do_op(void *data)
70226 +{
70227 +       struct pciback_device *pdev = data;
70228 +       struct pci_dev *dev;
70229 +       struct xen_pci_op *op = &pdev->sh_info->op;
70230 +
70231 +       dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
70232 +
70233 +       if (dev == NULL)
70234 +               op->err = XEN_PCI_ERR_dev_not_found;
70235 +       else if (op->cmd == XEN_PCI_OP_conf_read)
70236 +               op->err = pciback_config_read(dev, op->offset, op->size,
70237 +                                             &op->value);
70238 +       else if (op->cmd == XEN_PCI_OP_conf_write)
70239 +               op->err = pciback_config_write(dev, op->offset, op->size,
70240 +                                              op->value);
70241 +       else
70242 +               op->err = XEN_PCI_ERR_not_implemented;
70243 +
70244 +       /* Tell the driver domain that we're done. */ 
70245 +       wmb();
70246 +       clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
70247 +       notify_remote_via_irq(pdev->evtchn_irq);
70248 +
70249 +       /* Mark that we're done. */
70250 +       smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
70251 +       clear_bit(_PDEVF_op_active, &pdev->flags);
70252 +       smp_mb__after_clear_bit(); /* /before/ final check for work */
70253 +
70254 +       /* Check to see if the driver domain tried to start another request in
70255 +        * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. */
70256 +       test_and_schedule_op(pdev);
70257 +}
70258 +
70259 +irqreturn_t pciback_handle_event(int irq, void *dev_id)
70260 +{
70261 +       struct pciback_device *pdev = dev_id;
70262 +
70263 +       test_and_schedule_op(pdev);
70264 +
70265 +       return IRQ_HANDLED;
70266 +}
70267 diff -ruNp linux-2.6.19/drivers/xen/pciback/slot.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/slot.c
70268 --- linux-2.6.19/drivers/xen/pciback/slot.c     1970-01-01 00:00:00.000000000 +0000
70269 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/slot.c   2007-02-02 19:10:45.000000000 +0000
70270 @@ -0,0 +1,151 @@
70271 +/*
70272 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
70273 + *               to the frontend
70274 + *
70275 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
70276 + *   Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
70277 + */
70278 +
70279 +#include <linux/list.h>
70280 +#include <linux/slab.h>
70281 +#include <linux/pci.h>
70282 +#include <linux/spinlock.h>
70283 +#include "pciback.h"
70284 +
70285 +/* There are at most 32 slots in a pci bus.  */
70286 +#define PCI_SLOT_MAX 32
70287 +
70288 +#define PCI_BUS_NBR 2
70289 +
70290 +struct slot_dev_data {
70291 +       /* Access to slots must be protected by lock */
70292 +       struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
70293 +       spinlock_t lock;
70294 +};
70295 +
70296 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
70297 +                                   unsigned int domain, unsigned int bus,
70298 +                                   unsigned int devfn)
70299 +{
70300 +       struct pci_dev *dev = NULL;
70301 +       struct slot_dev_data *slot_dev = pdev->pci_dev_data;
70302 +       unsigned long flags;
70303 +
70304 +       if (domain != 0 || PCI_FUNC(devfn) != 0)
70305 +               return NULL;
70306 +
70307 +       if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
70308 +               return NULL;
70309 +
70310 +       spin_lock_irqsave(&slot_dev->lock, flags);
70311 +       dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
70312 +       spin_unlock_irqrestore(&slot_dev->lock, flags);
70313 +
70314 +       return dev;
70315 +}
70316 +
70317 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
70318 +{
70319 +       int err = 0, slot, bus;
70320 +       struct slot_dev_data *slot_dev = pdev->pci_dev_data;
70321 +       unsigned long flags;
70322 +
70323 +       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
70324 +               err = -EFAULT;
70325 +               xenbus_dev_fatal(pdev->xdev, err,
70326 +                                "Can't export bridges on the virtual PCI bus");
70327 +               goto out;
70328 +       }
70329 +
70330 +       spin_lock_irqsave(&slot_dev->lock, flags);
70331 +
70332 +       /* Assign to a new slot on the virtual PCI bus */
70333 +       for (bus = 0; bus < PCI_BUS_NBR; bus++)
70334 +               for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70335 +                       if (slot_dev->slots[bus][slot] == NULL) {
70336 +                               printk(KERN_INFO
70337 +                                      "pciback: slot: %s: assign to virtual slot %d, bus %d\n",
70338 +                                      pci_name(dev), slot, bus);
70339 +                               slot_dev->slots[bus][slot] = dev;
70340 +                               goto unlock;
70341 +                       }
70342 +               }
70343 +
70344 +       err = -ENOMEM;
70345 +       xenbus_dev_fatal(pdev->xdev, err,
70346 +                        "No more space on root virtual PCI bus");
70347 +
70348 +      unlock:
70349 +       spin_unlock_irqrestore(&slot_dev->lock, flags);
70350 +      out:
70351 +       return err;
70352 +}
70353 +
70354 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
70355 +{
70356 +       int slot, bus;
70357 +       struct slot_dev_data *slot_dev = pdev->pci_dev_data;
70358 +       struct pci_dev *found_dev = NULL;
70359 +       unsigned long flags;
70360 +
70361 +       spin_lock_irqsave(&slot_dev->lock, flags);
70362 +
70363 +       for (bus = 0; bus < PCI_BUS_NBR; bus++)
70364 +               for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70365 +                       if (slot_dev->slots[bus][slot] == dev) {
70366 +                               slot_dev->slots[bus][slot] = NULL;
70367 +                               found_dev = dev;
70368 +                               goto out;
70369 +                       }
70370 +               }
70371 +
70372 +      out:
70373 +       spin_unlock_irqrestore(&slot_dev->lock, flags);
70374 +
70375 +       if (found_dev)
70376 +               pcistub_put_pci_dev(found_dev);
70377 +}
70378 +
70379 +int pciback_init_devices(struct pciback_device *pdev)
70380 +{
70381 +       int slot, bus;
70382 +       struct slot_dev_data *slot_dev;
70383 +
70384 +       slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
70385 +       if (!slot_dev)
70386 +               return -ENOMEM;
70387 +
70388 +       spin_lock_init(&slot_dev->lock);
70389 +
70390 +       for (bus = 0; bus < PCI_BUS_NBR; bus++)
70391 +               for (slot = 0; slot < PCI_SLOT_MAX; slot++)
70392 +                       slot_dev->slots[bus][slot] = NULL;
70393 +
70394 +       pdev->pci_dev_data = slot_dev;
70395 +
70396 +       return 0;
70397 +}
70398 +
70399 +int pciback_publish_pci_roots(struct pciback_device *pdev,
70400 +                             publish_pci_root_cb publish_cb)
70401 +{
70402 +       /* The Virtual PCI bus has only one root */
70403 +       return publish_cb(pdev, 0, 0);
70404 +}
70405 +
70406 +void pciback_release_devices(struct pciback_device *pdev)
70407 +{
70408 +       int slot, bus;
70409 +       struct slot_dev_data *slot_dev = pdev->pci_dev_data;
70410 +       struct pci_dev *dev;
70411 +
70412 +       for (bus = 0; bus < PCI_BUS_NBR; bus++)
70413 +               for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70414 +                       dev = slot_dev->slots[bus][slot];
70415 +                       if (dev != NULL)
70416 +                               pcistub_put_pci_dev(dev);
70417 +               }
70418 +
70419 +       kfree(slot_dev);
70420 +       pdev->pci_dev_data = NULL;
70421 +}
70422 diff -ruNp linux-2.6.19/drivers/xen/pciback/vpci.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/vpci.c
70423 --- linux-2.6.19/drivers/xen/pciback/vpci.c     1970-01-01 00:00:00.000000000 +0000
70424 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/vpci.c   2007-02-02 19:10:45.000000000 +0000
70425 @@ -0,0 +1,204 @@
70426 +/*
70427 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
70428 + *               to the frontend
70429 + *
70430 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
70431 + */
70432 +
70433 +#include <linux/list.h>
70434 +#include <linux/slab.h>
70435 +#include <linux/pci.h>
70436 +#include <linux/spinlock.h>
70437 +#include "pciback.h"
70438 +
70439 +#define PCI_SLOT_MAX 32
70440 +
70441 +struct vpci_dev_data {
70442 +       /* Access to dev_list must be protected by lock */
70443 +       struct list_head dev_list[PCI_SLOT_MAX];
70444 +       spinlock_t lock;
70445 +};
70446 +
70447 +static inline struct list_head *list_first(struct list_head *head)
70448 +{
70449 +       return head->next;
70450 +}
70451 +
70452 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
70453 +                                   unsigned int domain, unsigned int bus,
70454 +                                   unsigned int devfn)
70455 +{
70456 +       struct pci_dev_entry *entry;
70457 +       struct pci_dev *dev = NULL;
70458 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
70459 +       unsigned long flags;
70460 +
70461 +       if (domain != 0 || bus != 0)
70462 +               return NULL;
70463 +
70464 +       if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
70465 +               spin_lock_irqsave(&vpci_dev->lock, flags);
70466 +
70467 +               list_for_each_entry(entry,
70468 +                                   &vpci_dev->dev_list[PCI_SLOT(devfn)],
70469 +                                   list) {
70470 +                       if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
70471 +                               dev = entry->dev;
70472 +                               break;
70473 +                       }
70474 +               }
70475 +
70476 +               spin_unlock_irqrestore(&vpci_dev->lock, flags);
70477 +       }
70478 +       return dev;
70479 +}
70480 +
70481 +static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
70482 +{
70483 +       if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
70484 +           && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
70485 +               return 1;
70486 +
70487 +       return 0;
70488 +}
70489 +
70490 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
70491 +{
70492 +       int err = 0, slot;
70493 +       struct pci_dev_entry *t, *dev_entry;
70494 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
70495 +       unsigned long flags;
70496 +
70497 +       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
70498 +               err = -EFAULT;
70499 +               xenbus_dev_fatal(pdev->xdev, err,
70500 +                                "Can't export bridges on the virtual PCI bus");
70501 +               goto out;
70502 +       }
70503 +
70504 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
70505 +       if (!dev_entry) {
70506 +               err = -ENOMEM;
70507 +               xenbus_dev_fatal(pdev->xdev, err,
70508 +                                "Error adding entry to virtual PCI bus");
70509 +               goto out;
70510 +       }
70511 +
70512 +       dev_entry->dev = dev;
70513 +
70514 +       spin_lock_irqsave(&vpci_dev->lock, flags);
70515 +
70516 +       /* Keep multi-function devices together on the virtual PCI bus */
70517 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70518 +               if (!list_empty(&vpci_dev->dev_list[slot])) {
70519 +                       t = list_entry(list_first(&vpci_dev->dev_list[slot]),
70520 +                                      struct pci_dev_entry, list);
70521 +
70522 +                       if (match_slot(dev, t->dev)) {
70523 +                               pr_info("pciback: vpci: %s: "
70524 +                                       "assign to virtual slot %d func %d\n",
70525 +                                       pci_name(dev), slot,
70526 +                                       PCI_FUNC(dev->devfn));
70527 +                               list_add_tail(&dev_entry->list,
70528 +                                             &vpci_dev->dev_list[slot]);
70529 +                               goto unlock;
70530 +                       }
70531 +               }
70532 +       }
70533 +
70534 +       /* Assign to a new slot on the virtual PCI bus */
70535 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70536 +               if (list_empty(&vpci_dev->dev_list[slot])) {
70537 +                       printk(KERN_INFO
70538 +                              "pciback: vpci: %s: assign to virtual slot %d\n",
70539 +                              pci_name(dev), slot);
70540 +                       list_add_tail(&dev_entry->list,
70541 +                                     &vpci_dev->dev_list[slot]);
70542 +                       goto unlock;
70543 +               }
70544 +       }
70545 +
70546 +       err = -ENOMEM;
70547 +       xenbus_dev_fatal(pdev->xdev, err,
70548 +                        "No more space on root virtual PCI bus");
70549 +
70550 +      unlock:
70551 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
70552 +      out:
70553 +       return err;
70554 +}
70555 +
70556 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
70557 +{
70558 +       int slot;
70559 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
70560 +       struct pci_dev *found_dev = NULL;
70561 +       unsigned long flags;
70562 +
70563 +       spin_lock_irqsave(&vpci_dev->lock, flags);
70564 +
70565 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70566 +               struct pci_dev_entry *e, *tmp;
70567 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
70568 +                                        list) {
70569 +                       if (e->dev == dev) {
70570 +                               list_del(&e->list);
70571 +                               found_dev = e->dev;
70572 +                               kfree(e);
70573 +                               goto out;
70574 +                       }
70575 +               }
70576 +       }
70577 +
70578 +      out:
70579 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
70580 +
70581 +       if (found_dev)
70582 +               pcistub_put_pci_dev(found_dev);
70583 +}
70584 +
70585 +int pciback_init_devices(struct pciback_device *pdev)
70586 +{
70587 +       int slot;
70588 +       struct vpci_dev_data *vpci_dev;
70589 +
70590 +       vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
70591 +       if (!vpci_dev)
70592 +               return -ENOMEM;
70593 +
70594 +       spin_lock_init(&vpci_dev->lock);
70595 +
70596 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70597 +               INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
70598 +       }
70599 +
70600 +       pdev->pci_dev_data = vpci_dev;
70601 +
70602 +       return 0;
70603 +}
70604 +
70605 +int pciback_publish_pci_roots(struct pciback_device *pdev,
70606 +                             publish_pci_root_cb publish_cb)
70607 +{
70608 +       /* The Virtual PCI bus has only one root */
70609 +       return publish_cb(pdev, 0, 0);
70610 +}
70611 +
70612 +void pciback_release_devices(struct pciback_device *pdev)
70613 +{
70614 +       int slot;
70615 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
70616 +
70617 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
70618 +               struct pci_dev_entry *e, *tmp;
70619 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
70620 +                                        list) {
70621 +                       list_del(&e->list);
70622 +                       pcistub_put_pci_dev(e->dev);
70623 +                       kfree(e);
70624 +               }
70625 +       }
70626 +
70627 +       kfree(vpci_dev);
70628 +       pdev->pci_dev_data = NULL;
70629 +}
70630 diff -ruNp linux-2.6.19/drivers/xen/pciback/xenbus.c linux-2.6.19-xen-3.0.4/drivers/xen/pciback/xenbus.c
70631 --- linux-2.6.19/drivers/xen/pciback/xenbus.c   1970-01-01 00:00:00.000000000 +0000
70632 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pciback/xenbus.c 2007-02-02 19:10:45.000000000 +0000
70633 @@ -0,0 +1,458 @@
70634 +/*
70635 + * PCI Backend Xenbus Setup - handles setup with frontend and xend
70636 + *
70637 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
70638 + */
70639 +#include <linux/module.h>
70640 +#include <linux/init.h>
70641 +#include <linux/list.h>
70642 +#include <linux/vmalloc.h>
70643 +#include <xen/xenbus.h>
70644 +#include <xen/evtchn.h>
70645 +#include "pciback.h"
70646 +
70647 +#define INVALID_EVTCHN_IRQ  (-1)
70648 +
70649 +static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
70650 +{
70651 +       struct pciback_device *pdev;
70652 +
70653 +       pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
70654 +       if (pdev == NULL)
70655 +               goto out;
70656 +       dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
70657 +
70658 +       pdev->xdev = xdev;
70659 +       xdev->dev.driver_data = pdev;
70660 +
70661 +       spin_lock_init(&pdev->dev_lock);
70662 +
70663 +       pdev->sh_area = NULL;
70664 +       pdev->sh_info = NULL;
70665 +       pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
70666 +       pdev->be_watching = 0;
70667 +
70668 +       INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
70669 +
70670 +       if (pciback_init_devices(pdev)) {
70671 +               kfree(pdev);
70672 +               pdev = NULL;
70673 +       }
70674 +      out:
70675 +       return pdev;
70676 +}
70677 +
70678 +static void free_pdev(struct pciback_device *pdev)
70679 +{
70680 +       if (pdev->be_watching)
70681 +               unregister_xenbus_watch(&pdev->be_watch);
70682 +
70683 +       /* Ensure the guest can't trigger our handler before removing devices */
70684 +       if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
70685 +               unbind_from_irqhandler(pdev->evtchn_irq, pdev);
70686 +
70687 +       /* If the driver domain started an op, make sure we complete it or
70688 +        * delete it before releasing the shared memory */
70689 +       cancel_delayed_work(&pdev->op_work);
70690 +       flush_scheduled_work();
70691 +
70692 +       if (pdev->sh_info)
70693 +               xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
70694 +
70695 +       pciback_release_devices(pdev);
70696 +
70697 +       pdev->xdev->dev.driver_data = NULL;
70698 +       pdev->xdev = NULL;
70699 +
70700 +       kfree(pdev);
70701 +}
70702 +
70703 +static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
70704 +                            int remote_evtchn)
70705 +{
70706 +       int err = 0;
70707 +       int evtchn;
70708 +       struct vm_struct *area;
70709 +
70710 +       dev_dbg(&pdev->xdev->dev,
70711 +               "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
70712 +               gnt_ref, remote_evtchn);
70713 +
70714 +       area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
70715 +       if (IS_ERR(area)) {
70716 +               err = PTR_ERR(area);
70717 +               goto out;
70718 +       }
70719 +       pdev->sh_area = area;
70720 +       pdev->sh_info = area->addr;
70721 +
70722 +       err = xenbus_bind_evtchn(pdev->xdev, remote_evtchn, &evtchn);
70723 +       if (err)
70724 +               goto out;
70725 +
70726 +       err = bind_evtchn_to_irqhandler(evtchn, pciback_handle_event,
70727 +                                       SA_SAMPLE_RANDOM, "pciback", pdev);
70728 +       if (err < 0) {
70729 +               xenbus_dev_fatal(pdev->xdev, err,
70730 +                                "Error binding event channel to IRQ");
70731 +               goto out;
70732 +       }
70733 +       pdev->evtchn_irq = err;
70734 +       err = 0;
70735 +
70736 +       dev_dbg(&pdev->xdev->dev, "Attached!\n");
70737 +      out:
70738 +       return err;
70739 +}
70740 +
70741 +static int pciback_attach(struct pciback_device *pdev)
70742 +{
70743 +       int err = 0;
70744 +       int gnt_ref, remote_evtchn;
70745 +       char *magic = NULL;
70746 +
70747 +       spin_lock(&pdev->dev_lock);
70748 +
70749 +       /* Make sure we only do this setup once */
70750 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
70751 +           XenbusStateInitialised)
70752 +               goto out;
70753 +
70754 +       /* Wait for frontend to state that it has published the configuration */
70755 +       if (xenbus_read_driver_state(pdev->xdev->otherend) !=
70756 +           XenbusStateInitialised)
70757 +               goto out;
70758 +
70759 +       dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
70760 +
70761 +       err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
70762 +                           "pci-op-ref", "%u", &gnt_ref,
70763 +                           "event-channel", "%u", &remote_evtchn,
70764 +                           "magic", NULL, &magic, NULL);
70765 +       if (err) {
70766 +               /* If configuration didn't get read correctly, wait longer */
70767 +               xenbus_dev_fatal(pdev->xdev, err,
70768 +                                "Error reading configuration from frontend");
70769 +               goto out;
70770 +       }
70771 +
70772 +       if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
70773 +               xenbus_dev_fatal(pdev->xdev, -EFAULT,
70774 +                                "version mismatch (%s/%s) with pcifront - "
70775 +                                "halting pciback",
70776 +                                magic, XEN_PCI_MAGIC);
70777 +               goto out;
70778 +       }
70779 +
70780 +       err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
70781 +       if (err)
70782 +               goto out;
70783 +
70784 +       dev_dbg(&pdev->xdev->dev, "Connecting...\n");
70785 +
70786 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
70787 +       if (err)
70788 +               xenbus_dev_fatal(pdev->xdev, err,
70789 +                                "Error switching to connected state!");
70790 +
70791 +       dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
70792 +      out:
70793 +       spin_unlock(&pdev->dev_lock);
70794 +
70795 +       if (magic)
70796 +               kfree(magic);
70797 +
70798 +       return err;
70799 +}
70800 +
70801 +static void pciback_frontend_changed(struct xenbus_device *xdev,
70802 +                                    enum xenbus_state fe_state)
70803 +{
70804 +       struct pciback_device *pdev = xdev->dev.driver_data;
70805 +
70806 +       dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
70807 +
70808 +       switch (fe_state) {
70809 +       case XenbusStateInitialised:
70810 +               pciback_attach(pdev);
70811 +               break;
70812 +
70813 +       case XenbusStateClosing:
70814 +               xenbus_switch_state(xdev, XenbusStateClosing);
70815 +               break;
70816 +
70817 +       case XenbusStateUnknown:
70818 +       case XenbusStateClosed:
70819 +               dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
70820 +               device_unregister(&xdev->dev);
70821 +               break;
70822 +
70823 +       default:
70824 +               break;
70825 +       }
70826 +}
70827 +
70828 +static int pciback_publish_pci_root(struct pciback_device *pdev,
70829 +                                   unsigned int domain, unsigned int bus)
70830 +{
70831 +       unsigned int d, b;
70832 +       int i, root_num, len, err;
70833 +       char str[64];
70834 +
70835 +       dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
70836 +
70837 +       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
70838 +                          "root_num", "%d", &root_num);
70839 +       if (err == 0 || err == -ENOENT)
70840 +               root_num = 0;
70841 +       else if (err < 0)
70842 +               goto out;
70843 +
70844 +       /* Verify that we haven't already published this pci root */
70845 +       for (i = 0; i < root_num; i++) {
70846 +               len = snprintf(str, sizeof(str), "root-%d", i);
70847 +               if (unlikely(len >= (sizeof(str) - 1))) {
70848 +                       err = -ENOMEM;
70849 +                       goto out;
70850 +               }
70851 +
70852 +               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
70853 +                                  str, "%x:%x", &d, &b);
70854 +               if (err < 0)
70855 +                       goto out;
70856 +               if (err != 2) {
70857 +                       err = -EINVAL;
70858 +                       goto out;
70859 +               }
70860 +
70861 +               if (d == domain && b == bus) {
70862 +                       err = 0;
70863 +                       goto out;
70864 +               }
70865 +       }
70866 +
70867 +       len = snprintf(str, sizeof(str), "root-%d", root_num);
70868 +       if (unlikely(len >= (sizeof(str) - 1))) {
70869 +               err = -ENOMEM;
70870 +               goto out;
70871 +       }
70872 +
70873 +       dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
70874 +               root_num, domain, bus);
70875 +
70876 +       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
70877 +                           "%04x:%02x", domain, bus);
70878 +       if (err)
70879 +               goto out;
70880 +
70881 +       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
70882 +                           "root_num", "%d", (root_num + 1));
70883 +
70884 +      out:
70885 +       return err;
70886 +}
70887 +
70888 +static int pciback_export_device(struct pciback_device *pdev,
70889 +                                int domain, int bus, int slot, int func)
70890 +{
70891 +       struct pci_dev *dev;
70892 +       int err = 0;
70893 +
70894 +       dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
70895 +               domain, bus, slot, func);
70896 +
70897 +       dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
70898 +       if (!dev) {
70899 +               err = -EINVAL;
70900 +               xenbus_dev_fatal(pdev->xdev, err,
70901 +                                "Couldn't locate PCI device "
70902 +                                "(%04x:%02x:%02x.%01x)! "
70903 +                                "perhaps already in use?",
70904 +                                domain, bus, slot, func);
70905 +               goto out;
70906 +       }
70907 +
70908 +       err = pciback_add_pci_dev(pdev, dev);
70909 +       if (err)
70910 +               goto out;
70911 +
70912 +       /* TODO: It'd be nice to export a bridge and have all of its children
70913 +        * get exported with it. This may be best done in xend (which will
70914 +        * have to calculate resource usage anyway) but we probably want to
70915 +        * put something in here to ensure that if a bridge is given to a
70916 +        * driver domain, all devices under that bridge are not given
70917 +        * to other driver domains (since whoever controls the bridge can
70918 +        * disable it and stop the other devices from working).
70919 +        */
70920 +      out:
70921 +       return err;
70922 +}
70923 +
70924 +static int pciback_setup_backend(struct pciback_device *pdev)
70925 +{
70926 +       /* Get configuration from xend (if available now) */
70927 +       int domain, bus, slot, func;
70928 +       int err = 0;
70929 +       int i, num_devs;
70930 +       char dev_str[64];
70931 +
70932 +       spin_lock(&pdev->dev_lock);
70933 +
70934 +       /* It's possible we could get the call to setup twice, so make sure
70935 +        * we're not already connected.
70936 +        */
70937 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
70938 +           XenbusStateInitWait)
70939 +               goto out;
70940 +
70941 +       dev_dbg(&pdev->xdev->dev, "getting be setup\n");
70942 +
70943 +       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
70944 +                          &num_devs);
70945 +       if (err != 1) {
70946 +               if (err >= 0)
70947 +                       err = -EINVAL;
70948 +               xenbus_dev_fatal(pdev->xdev, err,
70949 +                                "Error reading number of devices");
70950 +               goto out;
70951 +       }
70952 +
70953 +       for (i = 0; i < num_devs; i++) {
70954 +               int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
70955 +               if (unlikely(l >= (sizeof(dev_str) - 1))) {
70956 +                       err = -ENOMEM;
70957 +                       xenbus_dev_fatal(pdev->xdev, err,
70958 +                                        "String overflow while reading "
70959 +                                        "configuration");
70960 +                       goto out;
70961 +               }
70962 +
70963 +               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
70964 +                                  "%x:%x:%x.%x", &domain, &bus, &slot, &func);
70965 +               if (err < 0) {
70966 +                       xenbus_dev_fatal(pdev->xdev, err,
70967 +                                        "Error reading device configuration");
70968 +                       goto out;
70969 +               }
70970 +               if (err != 4) {
70971 +                       err = -EINVAL;
70972 +                       xenbus_dev_fatal(pdev->xdev, err,
70973 +                                        "Error parsing pci device "
70974 +                                        "configuration");
70975 +                       goto out;
70976 +               }
70977 +
70978 +               err = pciback_export_device(pdev, domain, bus, slot, func);
70979 +               if (err)
70980 +                       goto out;
70981 +       }
70982 +
70983 +       err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
70984 +       if (err) {
70985 +               xenbus_dev_fatal(pdev->xdev, err,
70986 +                                "Error while publishing PCI root buses "
70987 +                                "for frontend");
70988 +               goto out;
70989 +       }
70990 +
70991 +       err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
70992 +       if (err)
70993 +               xenbus_dev_fatal(pdev->xdev, err,
70994 +                                "Error switching to initialised state!");
70995 +
70996 +      out:
70997 +       spin_unlock(&pdev->dev_lock);
70998 +
70999 +       if (!err)
71000 +               /* see if pcifront is already configured (if not, we'll wait) */
71001 +               pciback_attach(pdev);
71002 +
71003 +       return err;
71004 +}
71005 +
71006 +static void pciback_be_watch(struct xenbus_watch *watch,
71007 +                            const char **vec, unsigned int len)
71008 +{
71009 +       struct pciback_device *pdev =
71010 +           container_of(watch, struct pciback_device, be_watch);
71011 +
71012 +       switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
71013 +       case XenbusStateInitWait:
71014 +               pciback_setup_backend(pdev);
71015 +               break;
71016 +
71017 +       default:
71018 +               break;
71019 +       }
71020 +}
71021 +
71022 +static int pciback_xenbus_probe(struct xenbus_device *dev,
71023 +                               const struct xenbus_device_id *id)
71024 +{
71025 +       int err = 0;
71026 +       struct pciback_device *pdev = alloc_pdev(dev);
71027 +
71028 +       if (pdev == NULL) {
71029 +               err = -ENOMEM;
71030 +               xenbus_dev_fatal(dev, err,
71031 +                                "Error allocating pciback_device struct");
71032 +               goto out;
71033 +       }
71034 +
71035 +       /* wait for xend to configure us */
71036 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
71037 +       if (err)
71038 +               goto out;
71039 +
71040 +       /* watch the backend node for backend configuration information */
71041 +       err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
71042 +                               pciback_be_watch);
71043 +       if (err)
71044 +               goto out;
71045 +       pdev->be_watching = 1;
71046 +
71047 +       /* We need to force a call to our callback here in case
71048 +        * xend already configured us!
71049 +        */
71050 +       pciback_be_watch(&pdev->be_watch, NULL, 0);
71051 +
71052 +      out:
71053 +       return err;
71054 +}
71055 +
71056 +static int pciback_xenbus_remove(struct xenbus_device *dev)
71057 +{
71058 +       struct pciback_device *pdev = dev->dev.driver_data;
71059 +
71060 +       if (pdev != NULL)
71061 +               free_pdev(pdev);
71062 +
71063 +       return 0;
71064 +}
71065 +
71066 +static struct xenbus_device_id xenpci_ids[] = {
71067 +       {"pci"},
71068 +       {{0}},
71069 +};
71070 +
71071 +static struct xenbus_driver xenbus_pciback_driver = {
71072 +       .name                   = "pciback",
71073 +       .owner                  = THIS_MODULE,
71074 +       .ids                    = xenpci_ids,
71075 +       .probe                  = pciback_xenbus_probe,
71076 +       .remove                 = pciback_xenbus_remove,
71077 +       .otherend_changed       = pciback_frontend_changed,
71078 +};
71079 +
71080 +int __init pciback_xenbus_register(void)
71081 +{
71082 +       if (!is_running_on_xen())
71083 +               return -ENODEV;
71084 +
71085 +       return xenbus_register_backend(&xenbus_pciback_driver);
71086 +}
71087 +
71088 +void __exit pciback_xenbus_unregister(void)
71089 +{
71090 +       xenbus_unregister_driver(&xenbus_pciback_driver);
71091 +}
71092 diff -ruNp linux-2.6.19/drivers/xen/pcifront/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/Makefile
71093 --- linux-2.6.19/drivers/xen/pcifront/Makefile  1970-01-01 00:00:00.000000000 +0000
71094 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/Makefile        2007-02-02 19:10:45.000000000 +0000
71095 @@ -0,0 +1,7 @@
71096 +obj-y += pcifront.o
71097 +
71098 +pcifront-y := pci_op.o xenbus.o pci.o
71099 +
71100 +ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y)
71101 +EXTRA_CFLAGS += -DDEBUG
71102 +endif
71103 diff -ruNp linux-2.6.19/drivers/xen/pcifront/pci.c linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/pci.c
71104 --- linux-2.6.19/drivers/xen/pcifront/pci.c     1970-01-01 00:00:00.000000000 +0000
71105 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/pci.c   2007-02-02 19:10:45.000000000 +0000
71106 @@ -0,0 +1,46 @@
71107 +/*
71108 + * PCI Frontend Operations - ensure only one PCI frontend runs at a time
71109 + *
71110 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
71111 + */
71112 +#include <linux/module.h>
71113 +#include <linux/init.h>
71114 +#include <linux/pci.h>
71115 +#include <linux/spinlock.h>
71116 +#include "pcifront.h"
71117 +
71118 +DEFINE_SPINLOCK(pcifront_dev_lock);
71119 +static struct pcifront_device *pcifront_dev = NULL;
71120 +
71121 +int pcifront_connect(struct pcifront_device *pdev)
71122 +{
71123 +       int err = 0;
71124 +
71125 +       spin_lock(&pcifront_dev_lock);
71126 +
71127 +       if (!pcifront_dev) {
71128 +               dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
71129 +               pcifront_dev = pdev;
71130 +       }
71131 +       else {
71132 +               dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
71133 +               err = -EEXIST;
71134 +       }
71135 +
71136 +       spin_unlock(&pcifront_dev_lock);
71137 +
71138 +       return err;
71139 +}
71140 +
71141 +void pcifront_disconnect(struct pcifront_device *pdev)
71142 +{
71143 +       spin_lock(&pcifront_dev_lock);
71144 +
71145 +       if (pdev == pcifront_dev) {
71146 +               dev_info(&pdev->xdev->dev,
71147 +                        "Disconnecting PCI Frontend Buses\n");
71148 +               pcifront_dev = NULL;
71149 +       }
71150 +
71151 +       spin_unlock(&pcifront_dev_lock);
71152 +}
71153 diff -ruNp linux-2.6.19/drivers/xen/pcifront/pci_op.c linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/pci_op.c
71154 --- linux-2.6.19/drivers/xen/pcifront/pci_op.c  1970-01-01 00:00:00.000000000 +0000
71155 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/pci_op.c        2007-02-02 19:10:45.000000000 +0000
71156 @@ -0,0 +1,273 @@
71157 +/*
71158 + * PCI Frontend Operations - Communicates with the backend (pciback)
71159 + *
71160 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
71161 + */
71162 +#include <linux/module.h>
71163 +#include <linux/version.h>
71164 +#include <linux/init.h>
71165 +#include <linux/pci.h>
71166 +#include <linux/spinlock.h>
71167 +#include <linux/time.h>
71168 +#include <xen/evtchn.h>
71169 +#include "pcifront.h"
71170 +
71171 +static int verbose_request = 0;
71172 +module_param(verbose_request, int, 0644);
71173 +
71174 +static int errno_to_pcibios_err(int errno)
71175 +{
71176 +       switch (errno) {
71177 +       case XEN_PCI_ERR_success:
71178 +               return PCIBIOS_SUCCESSFUL;
71179 +
71180 +       case XEN_PCI_ERR_dev_not_found:
71181 +               return PCIBIOS_DEVICE_NOT_FOUND;
71182 +
71183 +       case XEN_PCI_ERR_invalid_offset:
71184 +       case XEN_PCI_ERR_op_failed:
71185 +               return PCIBIOS_BAD_REGISTER_NUMBER;
71186 +
71187 +       case XEN_PCI_ERR_not_implemented:
71188 +               return PCIBIOS_FUNC_NOT_SUPPORTED;
71189 +
71190 +       case XEN_PCI_ERR_access_denied:
71191 +               return PCIBIOS_SET_FAILED;
71192 +       }
71193 +       return errno;
71194 +}
71195 +
71196 +static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
71197 +{
71198 +       int err = 0;
71199 +       struct xen_pci_op *active_op = &pdev->sh_info->op;
71200 +       unsigned long irq_flags;
71201 +       evtchn_port_t port = pdev->evtchn;
71202 +       s64 ns, ns_timeout;
71203 +       struct timeval tv;
71204 +
71205 +       spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
71206 +
71207 +       memcpy(active_op, op, sizeof(struct xen_pci_op));
71208 +
71209 +       /* Go */
71210 +       wmb();
71211 +       set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
71212 +       notify_remote_via_evtchn(port);
71213 +
71214 +       /*
71215 +        * We set a poll timeout of 3 seconds but give up on return after
71216 +        * 2 seconds. It is better to time out too late rather than too early
71217 +        * (in the latter case we end up continually re-executing poll() with a
71218 +        * timeout in the past). 1s difference gives plenty of slack for error.
71219 +        */
71220 +       do_gettimeofday(&tv);
71221 +       ns_timeout = timeval_to_ns(&tv) + 2 * NSEC_PER_SEC;
71222 +
71223 +       clear_evtchn(port);
71224 +
71225 +       while (test_bit(_XEN_PCIF_active,
71226 +                       (unsigned long *)&pdev->sh_info->flags)) {
71227 +               if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
71228 +                       BUG();
71229 +               clear_evtchn(port);
71230 +               do_gettimeofday(&tv);
71231 +               ns = timeval_to_ns(&tv);
71232 +               if (ns > ns_timeout) {
71233 +                       dev_err(&pdev->xdev->dev,
71234 +                               "pciback not responding!!!\n");
71235 +                       clear_bit(_XEN_PCIF_active,
71236 +                                 (unsigned long *)&pdev->sh_info->flags);
71237 +                       err = XEN_PCI_ERR_dev_not_found;
71238 +                       goto out;
71239 +               }
71240 +       }
71241 +
71242 +       memcpy(op, active_op, sizeof(struct xen_pci_op));
71243 +
71244 +       err = op->err;
71245 +      out:
71246 +       spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
71247 +       return err;
71248 +}
71249 +
71250 +/* Access to this function is spinlocked in drivers/pci/access.c */
71251 +static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
71252 +                            int where, int size, u32 * val)
71253 +{
71254 +       int err = 0;
71255 +       struct xen_pci_op op = {
71256 +               .cmd    = XEN_PCI_OP_conf_read,
71257 +               .domain = pci_domain_nr(bus),
71258 +               .bus    = bus->number,
71259 +               .devfn  = devfn,
71260 +               .offset = where,
71261 +               .size   = size,
71262 +       };
71263 +       struct pcifront_sd *sd = bus->sysdata;
71264 +       struct pcifront_device *pdev = pcifront_get_pdev(sd);
71265 +
71266 +       if (verbose_request)
71267 +               dev_info(&pdev->xdev->dev,
71268 +                        "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
71269 +                        pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
71270 +                        PCI_FUNC(devfn), where, size);
71271 +
71272 +       err = do_pci_op(pdev, &op);
71273 +
71274 +       if (likely(!err)) {
71275 +               if (verbose_request)
71276 +                       dev_info(&pdev->xdev->dev, "read got back value %x\n",
71277 +                                op.value);
71278 +
71279 +               *val = op.value;
71280 +       } else if (err == -ENODEV) {
71281 +               /* No device here, pretend that it just returned 0 */
71282 +               err = 0;
71283 +               *val = 0;
71284 +       }
71285 +
71286 +       return errno_to_pcibios_err(err);
71287 +}
71288 +
71289 +/* Access to this function is spinlocked in drivers/pci/access.c */
71290 +static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
71291 +                             int where, int size, u32 val)
71292 +{
71293 +       struct xen_pci_op op = {
71294 +               .cmd    = XEN_PCI_OP_conf_write,
71295 +               .domain = pci_domain_nr(bus),
71296 +               .bus    = bus->number,
71297 +               .devfn  = devfn,
71298 +               .offset = where,
71299 +               .size   = size,
71300 +               .value  = val,
71301 +       };
71302 +       struct pcifront_sd *sd = bus->sysdata;
71303 +       struct pcifront_device *pdev = pcifront_get_pdev(sd);
71304 +
71305 +       if (verbose_request)
71306 +               dev_info(&pdev->xdev->dev,
71307 +                        "write dev=%04x:%02x:%02x.%01x - "
71308 +                        "offset %x size %d val %x\n",
71309 +                        pci_domain_nr(bus), bus->number,
71310 +                        PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
71311 +
71312 +       return errno_to_pcibios_err(do_pci_op(pdev, &op));
71313 +}
71314 +
71315 +struct pci_ops pcifront_bus_ops = {
71316 +       .read = pcifront_bus_read,
71317 +       .write = pcifront_bus_write,
71318 +};
71319 +
71320 +/* Claim resources for the PCI frontend as-is, backend won't allow changes */
71321 +static void pcifront_claim_resource(struct pci_dev *dev, void *data)
71322 +{
71323 +       struct pcifront_device *pdev = data;
71324 +       int i;
71325 +       struct resource *r;
71326 +
71327 +       for (i = 0; i < PCI_NUM_RESOURCES; i++) {
71328 +               r = &dev->resource[i];
71329 +
71330 +               if (!r->parent && r->start && r->flags) {
71331 +                       dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
71332 +                               pci_name(dev), i);
71333 +                       pci_claim_resource(dev, i);
71334 +               }
71335 +       }
71336 +}
71337 +
71338 +int pcifront_scan_root(struct pcifront_device *pdev,
71339 +                      unsigned int domain, unsigned int bus)
71340 +{
71341 +       struct pci_bus *b;
71342 +       struct pcifront_sd *sd = NULL;
71343 +       struct pci_bus_entry *bus_entry = NULL;
71344 +       int err = 0;
71345 +
71346 +#ifndef CONFIG_PCI_DOMAINS
71347 +       if (domain != 0) {
71348 +               dev_err(&pdev->xdev->dev,
71349 +                       "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
71350 +               dev_err(&pdev->xdev->dev,
71351 +                       "Please compile with CONFIG_PCI_DOMAINS\n");
71352 +               err = -EINVAL;
71353 +               goto err_out;
71354 +       }
71355 +#endif
71356 +
71357 +       dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
71358 +                domain, bus);
71359 +
71360 +       bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
71361 +       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
71362 +       if (!bus_entry || !sd) {
71363 +               err = -ENOMEM;
71364 +               goto err_out;
71365 +       }
71366 +       pcifront_init_sd(sd, domain, pdev);
71367 +
71368 +       b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
71369 +                                 &pcifront_bus_ops, sd);
71370 +       if (!b) {
71371 +               dev_err(&pdev->xdev->dev,
71372 +                       "Error creating PCI Frontend Bus!\n");
71373 +               err = -ENOMEM;
71374 +               goto err_out;
71375 +       }
71376 +       bus_entry->bus = b;
71377 +
71378 +       list_add(&bus_entry->list, &pdev->root_buses);
71379 +
71380 +       /* Claim resources before going "live" with our devices */
71381 +       pci_walk_bus(b, pcifront_claim_resource, pdev);
71382 +
71383 +       pci_bus_add_devices(b);
71384 +
71385 +       return 0;
71386 +
71387 +      err_out:
71388 +       kfree(bus_entry);
71389 +       kfree(sd);
71390 +
71391 +       return err;
71392 +}
71393 +
71394 +static void free_root_bus_devs(struct pci_bus *bus)
71395 +{
71396 +       struct pci_dev *dev;
71397 +
71398 +       down_write(&pci_bus_sem);
71399 +       while (!list_empty(&bus->devices)) {
71400 +               dev = container_of(bus->devices.next, struct pci_dev, bus_list);
71401 +               up_write(&pci_bus_sem);
71402 +
71403 +               dev_dbg(&dev->dev, "removing device\n");
71404 +               pci_remove_bus_device(dev);
71405 +
71406 +               down_write(&pci_bus_sem);
71407 +       }
71408 +       up_write(&pci_bus_sem);
71409 +}
71410 +
71411 +void pcifront_free_roots(struct pcifront_device *pdev)
71412 +{
71413 +       struct pci_bus_entry *bus_entry, *t;
71414 +
71415 +       dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
71416 +
71417 +       list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
71418 +               list_del(&bus_entry->list);
71419 +
71420 +               free_root_bus_devs(bus_entry->bus);
71421 +
71422 +               kfree(bus_entry->bus->sysdata);
71423 +
71424 +               device_unregister(bus_entry->bus->bridge);
71425 +               pci_remove_bus(bus_entry->bus);
71426 +
71427 +               kfree(bus_entry);
71428 +       }
71429 +}
71430 diff -ruNp linux-2.6.19/drivers/xen/pcifront/pcifront.h linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/pcifront.h
71431 --- linux-2.6.19/drivers/xen/pcifront/pcifront.h        1970-01-01 00:00:00.000000000 +0000
71432 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/pcifront.h      2007-02-02 19:10:45.000000000 +0000
71433 @@ -0,0 +1,40 @@
71434 +/*
71435 + * PCI Frontend - Common data structures & function declarations
71436 + *
71437 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
71438 + */
71439 +#ifndef __XEN_PCIFRONT_H__
71440 +#define __XEN_PCIFRONT_H__
71441 +
71442 +#include <linux/spinlock.h>
71443 +#include <linux/pci.h>
71444 +#include <xen/xenbus.h>
71445 +#include <xen/interface/io/pciif.h>
71446 +#include <xen/pcifront.h>
71447 +
71448 +struct pci_bus_entry {
71449 +       struct list_head list;
71450 +       struct pci_bus *bus;
71451 +};
71452 +
71453 +struct pcifront_device {
71454 +       struct xenbus_device *xdev;
71455 +       struct list_head root_buses;
71456 +       spinlock_t dev_lock;
71457 +
71458 +       int evtchn;
71459 +       int gnt_ref;
71460 +
71461 +       /* Lock this when doing any operations in sh_info */
71462 +       spinlock_t sh_info_lock;
71463 +       struct xen_pci_sharedinfo *sh_info;
71464 +};
71465 +
71466 +int pcifront_connect(struct pcifront_device *pdev);
71467 +void pcifront_disconnect(struct pcifront_device *pdev);
71468 +
71469 +int pcifront_scan_root(struct pcifront_device *pdev,
71470 +                      unsigned int domain, unsigned int bus);
71471 +void pcifront_free_roots(struct pcifront_device *pdev);
71472 +
71473 +#endif /* __XEN_PCIFRONT_H__ */
71474 diff -ruNp linux-2.6.19/drivers/xen/pcifront/xenbus.c linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/xenbus.c
71475 --- linux-2.6.19/drivers/xen/pcifront/xenbus.c  1970-01-01 00:00:00.000000000 +0000
71476 +++ linux-2.6.19-xen-3.0.4/drivers/xen/pcifront/xenbus.c        2007-02-02 19:10:45.000000000 +0000
71477 @@ -0,0 +1,295 @@
71478 +/*
71479 + * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
71480 + *
71481 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
71482 + */
71483 +#include <linux/module.h>
71484 +#include <linux/init.h>
71485 +#include <linux/mm.h>
71486 +#include <xen/xenbus.h>
71487 +#include <xen/gnttab.h>
71488 +#include "pcifront.h"
71489 +
71490 +#define INVALID_GRANT_REF (0)
71491 +#define INVALID_EVTCHN    (-1)
71492 +
71493 +static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
71494 +{
71495 +       struct pcifront_device *pdev;
71496 +
71497 +       pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL);
71498 +       if (pdev == NULL)
71499 +               goto out;
71500 +
71501 +       pdev->sh_info =
71502 +           (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
71503 +       if (pdev->sh_info == NULL) {
71504 +               kfree(pdev);
71505 +               pdev = NULL;
71506 +               goto out;
71507 +       }
71508 +       pdev->sh_info->flags = 0;
71509 +
71510 +       xdev->dev.driver_data = pdev;
71511 +       pdev->xdev = xdev;
71512 +
71513 +       INIT_LIST_HEAD(&pdev->root_buses);
71514 +
71515 +       spin_lock_init(&pdev->dev_lock);
71516 +       spin_lock_init(&pdev->sh_info_lock);
71517 +
71518 +       pdev->evtchn = INVALID_EVTCHN;
71519 +       pdev->gnt_ref = INVALID_GRANT_REF;
71520 +
71521 +       dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
71522 +               pdev, pdev->sh_info);
71523 +      out:
71524 +       return pdev;
71525 +}
71526 +
71527 +static void free_pdev(struct pcifront_device *pdev)
71528 +{
71529 +       dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
71530 +
71531 +       pcifront_free_roots(pdev);
71532 +
71533 +       if (pdev->evtchn != INVALID_EVTCHN)
71534 +               xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
71535 +
71536 +       if (pdev->gnt_ref != INVALID_GRANT_REF)
71537 +               gnttab_end_foreign_access(pdev->gnt_ref, 0,
71538 +                                         (unsigned long)pdev->sh_info);
71539 +
71540 +       pdev->xdev->dev.driver_data = NULL;
71541 +
71542 +       kfree(pdev);
71543 +}
71544 +
71545 +static int pcifront_publish_info(struct pcifront_device *pdev)
71546 +{
71547 +       int err = 0;
71548 +       struct xenbus_transaction trans;
71549 +
71550 +       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
71551 +       if (err < 0)
71552 +               goto out;
71553 +
71554 +       pdev->gnt_ref = err;
71555 +
71556 +       err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
71557 +       if (err)
71558 +               goto out;
71559 +
71560 +      do_publish:
71561 +       err = xenbus_transaction_start(&trans);
71562 +       if (err) {
71563 +               xenbus_dev_fatal(pdev->xdev, err,
71564 +                                "Error writing configuration for backend "
71565 +                                "(start transaction)");
71566 +               goto out;
71567 +       }
71568 +
71569 +       err = xenbus_printf(trans, pdev->xdev->nodename,
71570 +                           "pci-op-ref", "%u", pdev->gnt_ref);
71571 +       if (!err)
71572 +               err = xenbus_printf(trans, pdev->xdev->nodename,
71573 +                                   "event-channel", "%u", pdev->evtchn);
71574 +       if (!err)
71575 +               err = xenbus_printf(trans, pdev->xdev->nodename,
71576 +                                   "magic", XEN_PCI_MAGIC);
71577 +
71578 +       if (err) {
71579 +               xenbus_transaction_end(trans, 1);
71580 +               xenbus_dev_fatal(pdev->xdev, err,
71581 +                                "Error writing configuration for backend");
71582 +               goto out;
71583 +       } else {
71584 +               err = xenbus_transaction_end(trans, 0);
71585 +               if (err == -EAGAIN)
71586 +                       goto do_publish;
71587 +               else if (err) {
71588 +                       xenbus_dev_fatal(pdev->xdev, err,
71589 +                                        "Error completing transaction "
71590 +                                        "for backend");
71591 +                       goto out;
71592 +               }
71593 +       }
71594 +
71595 +       xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
71596 +
71597 +       dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
71598 +
71599 +      out:
71600 +       return err;
71601 +}
71602 +
71603 +static int pcifront_try_connect(struct pcifront_device *pdev)
71604 +{
71605 +       int err = -EFAULT;
71606 +       int i, num_roots, len;
71607 +       char str[64];
71608 +       unsigned int domain, bus;
71609 +
71610 +       spin_lock(&pdev->dev_lock);
71611 +
71612 +       /* Only connect once */
71613 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
71614 +           XenbusStateInitialised)
71615 +               goto out;
71616 +
71617 +       err = pcifront_connect(pdev);
71618 +       if (err) {
71619 +               xenbus_dev_fatal(pdev->xdev, err,
71620 +                                "Error connecting PCI Frontend");
71621 +               goto out;
71622 +       }
71623 +
71624 +       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
71625 +                          "root_num", "%d", &num_roots);
71626 +       if (err == -ENOENT) {
71627 +               xenbus_dev_error(pdev->xdev, err,
71628 +                                "No PCI Roots found, trying 0000:00");
71629 +               err = pcifront_scan_root(pdev, 0, 0);
71630 +               num_roots = 0;
71631 +       } else if (err != 1) {
71632 +               if (err == 0)
71633 +                       err = -EINVAL;
71634 +               xenbus_dev_fatal(pdev->xdev, err,
71635 +                                "Error reading number of PCI roots");
71636 +               goto out;
71637 +       }
71638 +
71639 +       for (i = 0; i < num_roots; i++) {
71640 +               len = snprintf(str, sizeof(str), "root-%d", i);
71641 +               if (unlikely(len >= (sizeof(str) - 1))) {
71642 +                       err = -ENOMEM;
71643 +                       goto out;
71644 +               }
71645 +
71646 +               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
71647 +                                  "%x:%x", &domain, &bus);
71648 +               if (err != 2) {
71649 +                       if (err >= 0)
71650 +                               err = -EINVAL;
71651 +                       xenbus_dev_fatal(pdev->xdev, err,
71652 +                                        "Error reading PCI root %d", i);
71653 +                       goto out;
71654 +               }
71655 +
71656 +               err = pcifront_scan_root(pdev, domain, bus);
71657 +               if (err) {
71658 +                       xenbus_dev_fatal(pdev->xdev, err,
71659 +                                        "Error scanning PCI root %04x:%02x",
71660 +                                        domain, bus);
71661 +                       goto out;
71662 +               }
71663 +       }
71664 +
71665 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
71666 +       if (err)
71667 +               goto out;
71668 +
71669 +      out:
71670 +       spin_unlock(&pdev->dev_lock);
71671 +       return err;
71672 +}
71673 +
71674 +static int pcifront_try_disconnect(struct pcifront_device *pdev)
71675 +{
71676 +       int err = 0;
71677 +       enum xenbus_state prev_state;
71678 +
71679 +       spin_lock(&pdev->dev_lock);
71680 +
71681 +       prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
71682 +
71683 +       if (prev_state < XenbusStateClosing)
71684 +               err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
71685 +
71686 +       if (!err && prev_state == XenbusStateConnected)
71687 +               pcifront_disconnect(pdev);
71688 +
71689 +       spin_unlock(&pdev->dev_lock);
71690 +
71691 +       return err;
71692 +}
71693 +
71694 +static void pcifront_backend_changed(struct xenbus_device *xdev,
71695 +                                    enum xenbus_state be_state)
71696 +{
71697 +       struct pcifront_device *pdev = xdev->dev.driver_data;
71698 +
71699 +       switch (be_state) {
71700 +       case XenbusStateClosing:
71701 +               dev_warn(&xdev->dev, "backend going away!\n");
71702 +               pcifront_try_disconnect(pdev);
71703 +               break;
71704 +
71705 +       case XenbusStateUnknown:
71706 +       case XenbusStateClosed:
71707 +               dev_warn(&xdev->dev, "backend went away!\n");
71708 +               pcifront_try_disconnect(pdev);
71709 +
71710 +               device_unregister(&pdev->xdev->dev);
71711 +               break;
71712 +
71713 +       case XenbusStateConnected:
71714 +               pcifront_try_connect(pdev);
71715 +               break;
71716 +
71717 +       default:
71718 +               break;
71719 +       }
71720 +}
71721 +
71722 +static int pcifront_xenbus_probe(struct xenbus_device *xdev,
71723 +                                const struct xenbus_device_id *id)
71724 +{
71725 +       int err = 0;
71726 +       struct pcifront_device *pdev = alloc_pdev(xdev);
71727 +
71728 +       if (pdev == NULL) {
71729 +               err = -ENOMEM;
71730 +               xenbus_dev_fatal(xdev, err,
71731 +                                "Error allocating pcifront_device struct");
71732 +               goto out;
71733 +       }
71734 +
71735 +       err = pcifront_publish_info(pdev);
71736 +
71737 +      out:
71738 +       return err;
71739 +}
71740 +
71741 +static int pcifront_xenbus_remove(struct xenbus_device *xdev)
71742 +{
71743 +       if (xdev->dev.driver_data)
71744 +               free_pdev(xdev->dev.driver_data);
71745 +
71746 +       return 0;
71747 +}
71748 +
71749 +static struct xenbus_device_id xenpci_ids[] = {
71750 +       {"pci"},
71751 +       {{0}},
71752 +};
71753 +
71754 +static struct xenbus_driver xenbus_pcifront_driver = {
71755 +       .name                   = "pcifront",
71756 +       .owner                  = THIS_MODULE,
71757 +       .ids                    = xenpci_ids,
71758 +       .probe                  = pcifront_xenbus_probe,
71759 +       .remove                 = pcifront_xenbus_remove,
71760 +       .otherend_changed       = pcifront_backend_changed,
71761 +};
71762 +
71763 +static int __init pcifront_init(void)
71764 +{
71765 +       if (!is_running_on_xen())
71766 +               return -ENODEV;
71767 +
71768 +       return xenbus_register_frontend(&xenbus_pcifront_driver);
71769 +}
71770 +
71771 +/* Initialize after the Xen PCI Frontend Stub is initialized */
71772 +subsys_initcall(pcifront_init);
71773 diff -ruNp linux-2.6.19/drivers/xen/privcmd/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/privcmd/Makefile
71774 --- linux-2.6.19/drivers/xen/privcmd/Makefile   1970-01-01 00:00:00.000000000 +0000
71775 +++ linux-2.6.19-xen-3.0.4/drivers/xen/privcmd/Makefile 2007-02-02 19:10:45.000000000 +0000
71776 @@ -0,0 +1,2 @@
71777 +
71778 +obj-$(CONFIG_XEN_PRIVCMD)      := privcmd.o
71779 diff -ruNp linux-2.6.19/drivers/xen/privcmd/privcmd.c linux-2.6.19-xen-3.0.4/drivers/xen/privcmd/privcmd.c
71780 --- linux-2.6.19/drivers/xen/privcmd/privcmd.c  1970-01-01 00:00:00.000000000 +0000
71781 +++ linux-2.6.19-xen-3.0.4/drivers/xen/privcmd/privcmd.c        2007-02-02 19:10:45.000000000 +0000
71782 @@ -0,0 +1,285 @@
71783 +/******************************************************************************
71784 + * privcmd.c
71785 + * 
71786 + * Interface to privileged domain-0 commands.
71787 + * 
71788 + * Copyright (c) 2002-2004, K A Fraser, B Dragovic
71789 + */
71790 +
71791 +#include <linux/kernel.h>
71792 +#include <linux/sched.h>
71793 +#include <linux/slab.h>
71794 +#include <linux/string.h>
71795 +#include <linux/errno.h>
71796 +#include <linux/mm.h>
71797 +#include <linux/mman.h>
71798 +#include <linux/swap.h>
71799 +#include <linux/smp_lock.h>
71800 +#include <linux/highmem.h>
71801 +#include <linux/pagemap.h>
71802 +#include <linux/seq_file.h>
71803 +#include <linux/kthread.h>
71804 +#include <asm/hypervisor.h>
71805 +
71806 +#include <asm/pgalloc.h>
71807 +#include <asm/pgtable.h>
71808 +#include <asm/uaccess.h>
71809 +#include <asm/tlb.h>
71810 +#include <asm/hypervisor.h>
71811 +#include <xen/public/privcmd.h>
71812 +#include <xen/interface/xen.h>
71813 +#include <xen/interface/dom0_ops.h>
71814 +#include <xen/xen_proc.h>
71815 +
71816 +static struct proc_dir_entry *privcmd_intf;
71817 +static struct proc_dir_entry *capabilities_intf;
71818 +
71819 +#ifndef HAVE_ARCH_PRIVCMD_MMAP
71820 +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
71821 +#endif
71822 +
71823 +static int privcmd_ioctl(struct inode *inode, struct file *file,
71824 +                        unsigned int cmd, unsigned long data)
71825 +{
71826 +       int ret = -ENOSYS;
71827 +       void __user *udata = (void __user *) data;
71828 +
71829 +       switch (cmd) {
71830 +       case IOCTL_PRIVCMD_HYPERCALL: {
71831 +               privcmd_hypercall_t hypercall;
71832 +  
71833 +               if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
71834 +                       return -EFAULT;
71835 +
71836 +#if defined(__i386__)
71837 +               if (hypercall.op >= (PAGE_SIZE >> 5))
71838 +                       break;
71839 +               __asm__ __volatile__ (
71840 +                       "pushl %%ebx; pushl %%ecx; pushl %%edx; "
71841 +                       "pushl %%esi; pushl %%edi; "
71842 +                       "movl  8(%%eax),%%ebx ;"
71843 +                       "movl 16(%%eax),%%ecx ;"
71844 +                       "movl 24(%%eax),%%edx ;"
71845 +                       "movl 32(%%eax),%%esi ;"
71846 +                       "movl 40(%%eax),%%edi ;"
71847 +                       "movl   (%%eax),%%eax ;"
71848 +                       "shll $5,%%eax ;"
71849 +                       "addl $hypercall_page,%%eax ;"
71850 +                       "call *%%eax ;"
71851 +                       "popl %%edi; popl %%esi; popl %%edx; "
71852 +                       "popl %%ecx; popl %%ebx"
71853 +                       : "=a" (ret) : "0" (&hypercall) : "memory" );
71854 +#elif defined (__x86_64__)
71855 +               if (hypercall.op < (PAGE_SIZE >> 5)) {
71856 +                       long ign1, ign2, ign3;
71857 +                       __asm__ __volatile__ (
71858 +                               "movq %8,%%r10; movq %9,%%r8;"
71859 +                               "shll $5,%%eax ;"
71860 +                               "addq $hypercall_page,%%rax ;"
71861 +                               "call *%%rax"
71862 +                               : "=a" (ret), "=D" (ign1),
71863 +                                 "=S" (ign2), "=d" (ign3)
71864 +                               : "0" ((unsigned int)hypercall.op),
71865 +                               "1" (hypercall.arg[0]),
71866 +                               "2" (hypercall.arg[1]),
71867 +                               "3" (hypercall.arg[2]),
71868 +                               "g" (hypercall.arg[3]),
71869 +                               "g" (hypercall.arg[4])
71870 +                               : "r8", "r10", "memory" );
71871 +               }
71872 +#elif defined (__ia64__)
71873 +               ret = privcmd_hypercall(&hypercall);
71874 +#endif
71875 +       }
71876 +       break;
71877 +
71878 +       case IOCTL_PRIVCMD_MMAP: {
71879 +               privcmd_mmap_t mmapcmd;
71880 +               privcmd_mmap_entry_t msg;
71881 +               privcmd_mmap_entry_t __user *p;
71882 +               struct mm_struct *mm = current->mm;
71883 +               struct vm_area_struct *vma;
71884 +               unsigned long va;
71885 +               int i, rc;
71886 +
71887 +               if (!is_initial_xendomain())
71888 +                       return -EPERM;
71889 +
71890 +               if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
71891 +                       return -EFAULT;
71892 +
71893 +               p = mmapcmd.entry;
71894 +               if (copy_from_user(&msg, p, sizeof(msg)))
71895 +                       return -EFAULT;
71896 +
71897 +               down_read(&mm->mmap_sem);
71898 +
71899 +               vma = find_vma(mm, msg.va);
71900 +               rc = -EINVAL;
71901 +               if (!vma || (msg.va != vma->vm_start) ||
71902 +                   !privcmd_enforce_singleshot_mapping(vma))
71903 +                       goto mmap_out;
71904 +
71905 +               va = vma->vm_start;
71906 +
71907 +               for (i = 0; i < mmapcmd.num; i++) {
71908 +                       rc = -EFAULT;
71909 +                       if (copy_from_user(&msg, p, sizeof(msg)))
71910 +                               goto mmap_out;
71911 +
71912 +                       /* Do not allow range to wrap the address space. */
71913 +                       rc = -EINVAL;
71914 +                       if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) ||
71915 +                           ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va))
71916 +                               goto mmap_out;
71917 +
71918 +                       /* Range chunks must be contiguous in va space. */
71919 +                       if ((msg.va != va) ||
71920 +                           ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end))
71921 +                               goto mmap_out;
71922 +
71923 +                       if ((rc = direct_remap_pfn_range(
71924 +                               vma,
71925 +                               msg.va & PAGE_MASK, 
71926 +                               msg.mfn, 
71927 +                               msg.npages << PAGE_SHIFT, 
71928 +                               vma->vm_page_prot,
71929 +                               mmapcmd.dom)) < 0)
71930 +                               goto mmap_out;
71931 +
71932 +                       p++;
71933 +                       va += msg.npages << PAGE_SHIFT;
71934 +               }
71935 +
71936 +               rc = 0;
71937 +
71938 +       mmap_out:
71939 +               up_read(&mm->mmap_sem);
71940 +               ret = rc;
71941 +       }
71942 +       break;
71943 +
71944 +       case IOCTL_PRIVCMD_MMAPBATCH: {
71945 +               privcmd_mmapbatch_t m;
71946 +               struct mm_struct *mm = current->mm;
71947 +               struct vm_area_struct *vma;
71948 +               xen_pfn_t __user *p;
71949 +               unsigned long addr, mfn, nr_pages;
71950 +               int i;
71951 +
71952 +               if (!is_initial_xendomain())
71953 +                       return -EPERM;
71954 +
71955 +               if (copy_from_user(&m, udata, sizeof(m)))
71956 +                       return -EFAULT;
71957 +
71958 +               nr_pages = m.num;
71959 +               if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
71960 +                       return -EINVAL;
71961 +
71962 +               down_read(&mm->mmap_sem);
71963 +
71964 +               vma = find_vma(mm, m.addr);
71965 +               if (!vma ||
71966 +                   (m.addr != vma->vm_start) ||
71967 +                   ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
71968 +                   !privcmd_enforce_singleshot_mapping(vma)) {
71969 +                       up_read(&mm->mmap_sem);
71970 +                       return -EINVAL;
71971 +               }
71972 +
71973 +               p = m.arr;
71974 +               addr = m.addr;
71975 +               for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) {
71976 +                       if (get_user(mfn, p)) {
71977 +                               up_read(&mm->mmap_sem);
71978 +                               return -EFAULT;
71979 +                       }
71980 +
71981 +                       ret = direct_remap_pfn_range(vma, addr & PAGE_MASK,
71982 +                                                    mfn, PAGE_SIZE,
71983 +                                                    vma->vm_page_prot, m.dom);
71984 +                       if (ret < 0)
71985 +                               put_user(0xF0000000 | mfn, p);
71986 +               }
71987 +
71988 +               up_read(&mm->mmap_sem);
71989 +               ret = 0;
71990 +       }
71991 +       break;
71992 +
71993 +       default:
71994 +               ret = -EINVAL;
71995 +               break;
71996 +       }
71997 +
71998 +       return ret;
71999 +}
72000 +
72001 +#ifndef HAVE_ARCH_PRIVCMD_MMAP
72002 +static struct page *privcmd_nopage(struct vm_area_struct *vma,
72003 +                                  unsigned long address,
72004 +                                  int *type)
72005 +{
72006 +       return NOPAGE_SIGBUS;
72007 +}
72008 +
72009 +static struct vm_operations_struct privcmd_vm_ops = {
72010 +       .nopage = privcmd_nopage
72011 +};
72012 +
72013 +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
72014 +{
72015 +       /* Unsupported for auto-translate guests. */
72016 +       if (xen_feature(XENFEAT_auto_translated_physmap))
72017 +               return -ENOSYS;
72018 +
72019 +       /* DONTCOPY is essential for Xen as copy_page_range is broken. */
72020 +       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
72021 +       vma->vm_ops = &privcmd_vm_ops;
72022 +       vma->vm_private_data = NULL;
72023 +
72024 +       return 0;
72025 +}
72026 +
72027 +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
72028 +{
72029 +       return (xchg(&vma->vm_private_data, (void *)1) == NULL);
72030 +}
72031 +#endif
72032 +
72033 +static struct file_operations privcmd_file_ops = {
72034 +       .ioctl = privcmd_ioctl,
72035 +       .mmap  = privcmd_mmap,
72036 +};
72037 +
72038 +static int capabilities_read(char *page, char **start, off_t off,
72039 +                            int count, int *eof, void *data)
72040 +{
72041 +       int len = 0;
72042 +       *page = 0;
72043 +
72044 +       if (is_initial_xendomain())
72045 +               len = sprintf( page, "control_d\n" );
72046 +
72047 +       *eof = 1;
72048 +       return len;
72049 +}
72050 +
72051 +static int __init privcmd_init(void)
72052 +{
72053 +       if (!is_running_on_xen())
72054 +               return -ENODEV;
72055 +
72056 +       privcmd_intf = create_xen_proc_entry("privcmd", 0400);
72057 +       if (privcmd_intf != NULL)
72058 +               privcmd_intf->proc_fops = &privcmd_file_ops;
72059 +
72060 +       capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
72061 +       if (capabilities_intf != NULL)
72062 +               capabilities_intf->read_proc = capabilities_read;
72063 +
72064 +       return 0;
72065 +}
72066 +
72067 +__initcall(privcmd_init);
72068 diff -ruNp linux-2.6.19/drivers/xen/tpmback/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/Makefile
72069 --- linux-2.6.19/drivers/xen/tpmback/Makefile   1970-01-01 00:00:00.000000000 +0000
72070 +++ linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/Makefile 2007-02-02 19:10:45.000000000 +0000
72071 @@ -0,0 +1,4 @@
72072 +
72073 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmbk.o
72074 +
72075 +tpmbk-y += tpmback.o interface.o xenbus.o
72076 diff -ruNp linux-2.6.19/drivers/xen/tpmback/common.h linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/common.h
72077 --- linux-2.6.19/drivers/xen/tpmback/common.h   1970-01-01 00:00:00.000000000 +0000
72078 +++ linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/common.h 2007-02-02 19:10:45.000000000 +0000
72079 @@ -0,0 +1,86 @@
72080 +/******************************************************************************
72081 + * drivers/xen/tpmback/common.h
72082 + */
72083 +
72084 +#ifndef __TPMIF__BACKEND__COMMON_H__
72085 +#define __TPMIF__BACKEND__COMMON_H__
72086 +
72087 +#include <linux/version.h>
72088 +#include <linux/module.h>
72089 +#include <linux/interrupt.h>
72090 +#include <linux/slab.h>
72091 +#include <xen/evtchn.h>
72092 +#include <xen/driver_util.h>
72093 +#include <xen/interface/grant_table.h>
72094 +#include <xen/interface/io/tpmif.h>
72095 +#include <asm/io.h>
72096 +#include <asm/pgalloc.h>
72097 +
72098 +#define DPRINTK(_f, _a...)                     \
72099 +       pr_debug("(file=%s, line=%d) " _f,      \
72100 +                __FILE__ , __LINE__ , ## _a )
72101 +
72102 +struct backend_info;
72103 +
72104 +typedef struct tpmif_st {
72105 +       struct list_head tpmif_list;
72106 +       /* Unique identifier for this interface. */
72107 +       domid_t domid;
72108 +       unsigned int handle;
72109 +
72110 +       /* Physical parameters of the comms window. */
72111 +       unsigned int evtchn;
72112 +       unsigned int irq;
72113 +
72114 +       /* The shared rings and indexes. */
72115 +       tpmif_tx_interface_t *tx;
72116 +       struct vm_struct *tx_area;
72117 +
72118 +       /* Miscellaneous private stuff. */
72119 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
72120 +       int active;
72121 +
72122 +       struct tpmif_st *hash_next;
72123 +       struct list_head list;  /* scheduling list */
72124 +       atomic_t refcnt;
72125 +
72126 +       struct backend_info *bi;
72127 +
72128 +       grant_handle_t shmem_handle;
72129 +       grant_ref_t shmem_ref;
72130 +       struct page **mmap_pages;
72131 +
72132 +       char devname[20];
72133 +} tpmif_t;
72134 +
72135 +void tpmif_disconnect_complete(tpmif_t * tpmif);
72136 +tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
72137 +void tpmif_interface_init(void);
72138 +void tpmif_interface_exit(void);
72139 +void tpmif_schedule_work(tpmif_t * tpmif);
72140 +void tpmif_deschedule_work(tpmif_t * tpmif);
72141 +void tpmif_xenbus_init(void);
72142 +void tpmif_xenbus_exit(void);
72143 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
72144 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
72145 +
72146 +long int tpmback_get_instance(struct backend_info *bi);
72147 +
72148 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
72149 +
72150 +
72151 +#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
72152 +#define tpmif_put(_b)                                  \
72153 +       do {                                            \
72154 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
72155 +                       tpmif_disconnect_complete(_b);  \
72156 +       } while (0)
72157 +
72158 +extern int num_frontends;
72159 +
72160 +static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
72161 +{
72162 +       return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx]));
72163 +}
72164 +
72165 +#endif /* __TPMIF__BACKEND__COMMON_H__ */
72166 diff -ruNp linux-2.6.19/drivers/xen/tpmback/interface.c linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/interface.c
72167 --- linux-2.6.19/drivers/xen/tpmback/interface.c        1970-01-01 00:00:00.000000000 +0000
72168 +++ linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/interface.c      2007-02-02 19:10:45.000000000 +0000
72169 @@ -0,0 +1,182 @@
72170 + /*****************************************************************************
72171 + * drivers/xen/tpmback/interface.c
72172 + *
72173 + * Virtual TPM interface management.
72174 + *
72175 + * Copyright (c) 2005, IBM Corporation
72176 + *
72177 + * Author: Stefan Berger, stefanb@us.ibm.com
72178 + *
72179 + * This code has been derived from drivers/xen/netback/interface.c
72180 + * Copyright (c) 2004, Keir Fraser
72181 + */
72182 +
72183 +#include "common.h"
72184 +#include <xen/balloon.h>
72185 +#include <xen/gnttab.h>
72186 +
72187 +static kmem_cache_t *tpmif_cachep;
72188 +int num_frontends = 0;
72189 +
72190 +LIST_HEAD(tpmif_list);
72191 +
72192 +static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
72193 +{
72194 +       tpmif_t *tpmif;
72195 +
72196 +       tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
72197 +       if (tpmif == NULL)
72198 +               goto out_of_memory;
72199 +
72200 +       memset(tpmif, 0, sizeof (*tpmif));
72201 +       tpmif->domid = domid;
72202 +       tpmif->status = DISCONNECTED;
72203 +       tpmif->bi = bi;
72204 +       snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
72205 +       atomic_set(&tpmif->refcnt, 1);
72206 +
72207 +       tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
72208 +       if (tpmif->mmap_pages == NULL)
72209 +               goto out_of_memory;
72210 +
72211 +       list_add(&tpmif->tpmif_list, &tpmif_list);
72212 +       num_frontends++;
72213 +
72214 +       return tpmif;
72215 +
72216 + out_of_memory:
72217 +       if (tpmif != NULL)
72218 +               kmem_cache_free(tpmif_cachep, tpmif);
72219 +       printk("%s: out of memory\n", __FUNCTION__);
72220 +       return ERR_PTR(-ENOMEM);
72221 +}
72222 +
72223 +static void free_tpmif(tpmif_t * tpmif)
72224 +{
72225 +       num_frontends--;
72226 +       list_del(&tpmif->tpmif_list);
72227 +       free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
72228 +       kmem_cache_free(tpmif_cachep, tpmif);
72229 +}
72230 +
72231 +tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
72232 +{
72233 +       tpmif_t *tpmif;
72234 +
72235 +       list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
72236 +               if (tpmif->bi == bi) {
72237 +                       if (tpmif->domid == domid) {
72238 +                               tpmif_get(tpmif);
72239 +                               return tpmif;
72240 +                       } else {
72241 +                               return ERR_PTR(-EEXIST);
72242 +                       }
72243 +               }
72244 +       }
72245 +
72246 +       return alloc_tpmif(domid, bi);
72247 +}
72248 +
72249 +static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
72250 +{
72251 +       int ret;
72252 +       struct gnttab_map_grant_ref op;
72253 +
72254 +       gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr,
72255 +                         GNTMAP_host_map, shared_page, tpmif->domid);
72256 +
72257 +       lock_vm_area(tpmif->tx_area);
72258 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
72259 +       unlock_vm_area(tpmif->tx_area);
72260 +       BUG_ON(ret);
72261 +
72262 +       if (op.status) {
72263 +               DPRINTK("Grant table operation failure!\n");
72264 +               return op.status;
72265 +       }
72266 +
72267 +       tpmif->shmem_ref = shared_page;
72268 +       tpmif->shmem_handle = op.handle;
72269 +
72270 +       return 0;
72271 +}
72272 +
72273 +static void unmap_frontend_page(tpmif_t *tpmif)
72274 +{
72275 +       struct gnttab_unmap_grant_ref op;
72276 +       int ret;
72277 +
72278 +       gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr,
72279 +                           GNTMAP_host_map, tpmif->shmem_handle);
72280 +
72281 +       lock_vm_area(tpmif->tx_area);
72282 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
72283 +       unlock_vm_area(tpmif->tx_area);
72284 +       BUG_ON(ret);
72285 +}
72286 +
72287 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
72288 +{
72289 +       int err;
72290 +       struct evtchn_bind_interdomain bind_interdomain;
72291 +
72292 +       if (tpmif->irq) {
72293 +               return 0;
72294 +       }
72295 +
72296 +       if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
72297 +               return -ENOMEM;
72298 +
72299 +       err = map_frontend_page(tpmif, shared_page);
72300 +       if (err) {
72301 +               free_vm_area(tpmif->tx_area);
72302 +               return err;
72303 +       }
72304 +
72305 +
72306 +       bind_interdomain.remote_dom  = tpmif->domid;
72307 +       bind_interdomain.remote_port = evtchn;
72308 +
72309 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
72310 +                                         &bind_interdomain);
72311 +       if (err) {
72312 +               unmap_frontend_page(tpmif);
72313 +               free_vm_area(tpmif->tx_area);
72314 +               return err;
72315 +       }
72316 +
72317 +       tpmif->evtchn = bind_interdomain.local_port;
72318 +
72319 +       tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
72320 +
72321 +       tpmif->irq = bind_evtchn_to_irqhandler(
72322 +               tpmif->evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
72323 +       tpmif->shmem_ref = shared_page;
72324 +       tpmif->active = 1;
72325 +
72326 +       return 0;
72327 +}
72328 +
72329 +void tpmif_disconnect_complete(tpmif_t *tpmif)
72330 +{
72331 +       if (tpmif->irq)
72332 +               unbind_from_irqhandler(tpmif->irq, tpmif);
72333 +
72334 +       if (tpmif->tx) {
72335 +               unmap_frontend_page(tpmif);
72336 +               free_vm_area(tpmif->tx_area);
72337 +       }
72338 +
72339 +       free_tpmif(tpmif);
72340 +}
72341 +
72342 +void __init tpmif_interface_init(void)
72343 +{
72344 +       tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
72345 +                                        0, 0, NULL, NULL);
72346 +}
72347 +
72348 +void __exit tpmif_interface_exit(void)
72349 +{
72350 +       kmem_cache_destroy(tpmif_cachep);
72351 +}
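
A minimal usage sketch (not part of the patch) of the tpmif lifecycle implemented above: tpmif_find() returns a reference-counted per-frontend interface, tpmif_map() maps the frontend's shared ring page and binds its event channel, and tpmif_put()/tpmif_disconnect_complete() tear it down again. It assumes the tpmif_t, tpmif_put() and struct backend_info declarations from the driver's common.h (not shown here); connect_ring() in xenbus.c further below does essentially the same thing.

/* Hypothetical helper, for illustration only. */
static int example_attach(struct backend_info *bi, domid_t domid,
                          unsigned long ring_ref, unsigned int evtchn)
{
        tpmif_t *tpmif;
        int err;

        /* Look up (or create) the interface for this frontend domain;
         * this takes a reference that must be dropped with tpmif_put(). */
        tpmif = tpmif_find(domid, bi);
        if (IS_ERR(tpmif))
                return PTR_ERR(tpmif);

        /* Map the shared ring page granted by the frontend and bind
         * the interdomain event channel to tpmif_be_int(). */
        err = tpmif_map(tpmif, ring_ref, evtchn);
        if (err)
                tpmif_put(tpmif);
        return err;
}
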
72352 diff -ruNp linux-2.6.19/drivers/xen/tpmback/tpmback.c linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/tpmback.c
72353 --- linux-2.6.19/drivers/xen/tpmback/tpmback.c  1970-01-01 00:00:00.000000000 +0000
72354 +++ linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/tpmback.c        2007-02-02 19:10:45.000000000 +0000
72355 @@ -0,0 +1,944 @@
72356 +/******************************************************************************
72357 + * drivers/xen/tpmback/tpmback.c
72358 + *
72359 + * Copyright (c) 2005, IBM Corporation
72360 + *
72361 + * Author: Stefan Berger, stefanb@us.ibm.com
72362 + * Grant table support: Mahadevan Gomathisankaran
72363 + *
72364 + * This code has been derived from drivers/xen/netback/netback.c
72365 + * Copyright (c) 2002-2004, K A Fraser
72366 + *
72367 + */
72368 +
72369 +#include "common.h"
72370 +#include <xen/evtchn.h>
72371 +
72372 +#include <linux/types.h>
72373 +#include <linux/list.h>
72374 +#include <linux/miscdevice.h>
72375 +#include <linux/poll.h>
72376 +#include <asm/uaccess.h>
72377 +#include <xen/xenbus.h>
72378 +#include <xen/interface/grant_table.h>
72379 +#include <xen/gnttab.h>
72380 +
72381 +/* local data structures */
72382 +struct data_exchange {
72383 +       struct list_head pending_pak;
72384 +       struct list_head current_pak;
72385 +       unsigned int copied_so_far;
72386 +       u8 has_opener:1;
72387 +       u8 aborted:1;
72388 +       rwlock_t pak_lock;      // protects all of the previous fields
72389 +       wait_queue_head_t wait_queue;
72390 +};
72391 +
72392 +struct vtpm_resp_hdr {
72393 +       uint32_t instance_no;
72394 +       uint16_t tag_no;
72395 +       uint32_t len_no;
72396 +       uint32_t ordinal_no;
72397 +} __attribute__ ((packed));
72398 +
72399 +struct packet {
72400 +       struct list_head next;
72401 +       unsigned int data_len;
72402 +       u8 *data_buffer;
72403 +       tpmif_t *tpmif;
72404 +       u32 tpm_instance;
72405 +       u8 req_tag;
72406 +       u32 last_read;
72407 +       u8 flags;
72408 +       struct timer_list processing_timer;
72409 +};
72410 +
72411 +enum {
72412 +       PACKET_FLAG_DISCARD_RESPONSE = 1,
72413 +};
72414 +
72415 +/* local variables */
72416 +static struct data_exchange dataex;
72417 +
72418 +/* local function prototypes */
72419 +static int _packet_write(struct packet *pak,
72420 +                        const char *data, size_t size, int userbuffer);
72421 +static void processing_timeout(unsigned long ptr);
72422 +static int packet_read_shmem(struct packet *pak,
72423 +                            tpmif_t * tpmif,
72424 +                            u32 offset,
72425 +                            char *buffer, int isuserbuffer, u32 left);
72426 +static int vtpm_queue_packet(struct packet *pak);
72427 +
72428 +/***************************************************************
72429 + Buffer copying for user and kernel space buffers.
72430 +***************************************************************/
72431 +static inline int copy_from_buffer(void *to,
72432 +                                  const void *from, unsigned long size,
72433 +                                  int isuserbuffer)
72434 +{
72435 +       if (isuserbuffer) {
72436 +               if (copy_from_user(to, (void __user *)from, size))
72437 +                       return -EFAULT;
72438 +       } else {
72439 +               memcpy(to, from, size);
72440 +       }
72441 +       return 0;
72442 +}
72443 +
72444 +static inline int copy_to_buffer(void *to,
72445 +                                const void *from, unsigned long size,
72446 +                                int isuserbuffer)
72447 +{
72448 +       if (isuserbuffer) {
72449 +               if (copy_to_user((void __user *)to, from, size))
72450 +                       return -EFAULT;
72451 +       } else {
72452 +               memcpy(to, from, size);
72453 +       }
72454 +       return 0;
72455 +}
72456 +
72457 +
72458 +static void dataex_init(struct data_exchange *dataex)
72459 +{
72460 +       INIT_LIST_HEAD(&dataex->pending_pak);
72461 +       INIT_LIST_HEAD(&dataex->current_pak);
72462 +       dataex->has_opener = 0;
72463 +       rwlock_init(&dataex->pak_lock);
72464 +       init_waitqueue_head(&dataex->wait_queue);
72465 +}
72466 +
72467 +/***************************************************************
72468 + Packet-related functions
72469 +***************************************************************/
72470 +
72471 +static struct packet *packet_find_instance(struct list_head *head,
72472 +                                          u32 tpm_instance)
72473 +{
72474 +       struct packet *pak;
72475 +       struct list_head *p;
72476 +
72477 +       /*
72478 +        * traverse the list of packets and return the first
72479 +        * one with the given instance number
72480 +        */
72481 +       list_for_each(p, head) {
72482 +               pak = list_entry(p, struct packet, next);
72483 +
72484 +               if (pak->tpm_instance == tpm_instance) {
72485 +                       return pak;
72486 +               }
72487 +       }
72488 +       return NULL;
72489 +}
72490 +
72491 +static struct packet *packet_find_packet(struct list_head *head, void *packet)
72492 +{
72493 +       struct packet *pak;
72494 +       struct list_head *p;
72495 +
72496 +       /*
72497 +        * traverse the list of packets and return the
72498 +        * one matching the given packet pointer
72499 +        */
72500 +       list_for_each(p, head) {
72501 +               pak = list_entry(p, struct packet, next);
72502 +
72503 +               if (pak == packet) {
72504 +                       return pak;
72505 +               }
72506 +       }
72507 +       return NULL;
72508 +}
72509 +
72510 +static struct packet *packet_alloc(tpmif_t * tpmif,
72511 +                                  u32 size, u8 req_tag, u8 flags)
72512 +{
72513 +       struct packet *pak = NULL;
72514 +       pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
72515 +       if (NULL != pak) {
72516 +               if (tpmif) {
72517 +                       pak->tpmif = tpmif;
72518 +                       pak->tpm_instance = tpmback_get_instance(tpmif->bi);
72519 +                       tpmif_get(tpmif);
72520 +               }
72521 +               pak->data_len = size;
72522 +               pak->req_tag = req_tag;
72523 +               pak->last_read = 0;
72524 +               pak->flags = flags;
72525 +
72526 +               /*
72527 +                * cannot do tpmif_get(tpmif); bad things happen
72528 +                * the reference on tpmif taken above is dropped
72529 +                * in packet_free() when the packet is released
72530 +               init_timer(&pak->processing_timer);
72531 +               pak->processing_timer.function = processing_timeout;
72532 +               pak->processing_timer.data = (unsigned long)pak;
72533 +       }
72534 +       return pak;
72535 +}
72536 +
72537 +static void inline packet_reset(struct packet *pak)
72538 +{
72539 +       pak->last_read = 0;
72540 +}
72541 +
72542 +static void packet_free(struct packet *pak)
72543 +{
72544 +       if (timer_pending(&pak->processing_timer)) {
72545 +               BUG();
72546 +       }
72547 +
72548 +       if (pak->tpmif)
72549 +               tpmif_put(pak->tpmif);
72550 +       kfree(pak->data_buffer);
72551 +       /*
72552 +        * the tpmif reference taken in packet_alloc() has
72553 +        * already been dropped above
72554 +        */
72555 +       kfree(pak);
72556 +}
72557 +
72558 +
72559 +/*
72560 + * Write data to the shared memory and send it to the FE.
72561 + */
72562 +static int packet_write(struct packet *pak,
72563 +                       const char *data, size_t size, int isuserbuffer)
72564 +{
72565 +       int rc = 0;
72566 +
72567 +       if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
72568 +               /* Don't send a response to this packet. Just acknowledge it. */
72569 +               rc = size;
72570 +       } else {
72571 +               rc = _packet_write(pak, data, size, isuserbuffer);
72572 +       }
72573 +
72574 +       return rc;
72575 +}
72576 +
72577 +int _packet_write(struct packet *pak,
72578 +                 const char *data, size_t size, int isuserbuffer)
72579 +{
72580 +       /*
72581 +        * Write into the shared memory pages directly
72582 +        * and send it to the front end.
72583 +        */
72584 +       tpmif_t *tpmif = pak->tpmif;
72585 +       grant_handle_t handle;
72586 +       int rc = 0;
72587 +       unsigned int i = 0;
72588 +       unsigned int offset = 0;
72589 +
72590 +       if (tpmif == NULL) {
72591 +               return -EFAULT;
72592 +       }
72593 +
72594 +       if (tpmif->status == DISCONNECTED) {
72595 +               return size;
72596 +       }
72597 +
72598 +       while (offset < size && i < TPMIF_TX_RING_SIZE) {
72599 +               unsigned int tocopy;
72600 +               struct gnttab_map_grant_ref map_op;
72601 +               struct gnttab_unmap_grant_ref unmap_op;
72602 +               tpmif_tx_request_t *tx;
72603 +
72604 +               tx = &tpmif->tx->ring[i].req;
72605 +
72606 +               if (0 == tx->addr) {
72607 +                       DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
72608 +                       return 0;
72609 +               }
72610 +
72611 +               gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
72612 +                                 GNTMAP_host_map, tx->ref, tpmif->domid);
72613 +
72614 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
72615 +                                                      &map_op, 1))) {
72616 +                       BUG();
72617 +               }
72618 +
72619 +               handle = map_op.handle;
72620 +
72621 +               if (map_op.status) {
72622 +                       DPRINTK("Grant table operation failure!\n");
72623 +                       return 0;
72624 +               }
72625 +
72626 +               tocopy = min_t(size_t, size - offset, PAGE_SIZE);
72627 +
72628 +               if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) |
72629 +                                             (tx->addr & ~PAGE_MASK)),
72630 +                                    &data[offset], tocopy, isuserbuffer)) {
72631 +                       tpmif_put(tpmif);
72632 +                       return -EFAULT;
72633 +               }
72634 +               tx->size = tocopy;
72635 +
72636 +               gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
72637 +                                   GNTMAP_host_map, handle);
72638 +
72639 +               if (unlikely
72640 +                   (HYPERVISOR_grant_table_op
72641 +                    (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
72642 +                       BUG();
72643 +               }
72644 +
72645 +               offset += tocopy;
72646 +               i++;
72647 +       }
72648 +
72649 +       rc = offset;
72650 +       DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
72651 +       notify_remote_via_irq(tpmif->irq);
72652 +
72653 +       return rc;
72654 +}
72655 +
72656 +/*
72657 + * Read data from the shared memory and copy it directly into the
72658 + * provided buffer. Advance the read_last indicator which tells
72659 + * how many bytes have already been read.
72660 + */
72661 +static int packet_read(struct packet *pak, size_t numbytes,
72662 +                      char *buffer, size_t buffersize, int isuserbuffer)
72663 +{
72664 +       tpmif_t *tpmif = pak->tpmif;
72665 +
72666 +       /*
72667 +        * Read 'numbytes' of data from the buffer. The first 4
72668 +        * bytes are the instance number in network byte order,
72669 +        * after that come the data from the shared memory buffer.
72670 +        */
72671 +       u32 to_copy;
72672 +       u32 offset = 0;
72673 +       u32 room_left = buffersize;
72674 +
72675 +       if (pak->last_read < 4) {
72676 +               /*
72677 +                * copy the instance number into the buffer
72678 +                */
72679 +               u32 instance_no = htonl(pak->tpm_instance);
72680 +               u32 last_read = pak->last_read;
72681 +
72682 +               to_copy = min_t(size_t, 4 - last_read, numbytes);
72683 +
72684 +               if (copy_to_buffer(&buffer[0],
72685 +                                  &(((u8 *) & instance_no)[last_read]),
72686 +                                  to_copy, isuserbuffer)) {
72687 +                       return -EFAULT;
72688 +               }
72689 +
72690 +               pak->last_read += to_copy;
72691 +               offset += to_copy;
72692 +               room_left -= to_copy;
72693 +       }
72694 +
72695 +       /*
72696 +        * If the packet has a data buffer appended, read from it...
72697 +        */
72698 +
72699 +       if (room_left > 0) {
72700 +               if (pak->data_buffer) {
72701 +                       u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
72702 +                       u32 last_read = pak->last_read - 4;
72703 +
72704 +                       if (copy_to_buffer(&buffer[offset],
72705 +                                          &pak->data_buffer[last_read],
72706 +                                          to_copy, isuserbuffer)) {
72707 +                               return -EFAULT;
72708 +                       }
72709 +                       pak->last_read += to_copy;
72710 +                       offset += to_copy;
72711 +               } else {
72712 +                       offset = packet_read_shmem(pak,
72713 +                                                  tpmif,
72714 +                                                  offset,
72715 +                                                  buffer,
72716 +                                                  isuserbuffer, room_left);
72717 +               }
72718 +       }
72719 +       return offset;
72720 +}
72721 +
72722 +static int packet_read_shmem(struct packet *pak,
72723 +                            tpmif_t * tpmif,
72724 +                            u32 offset, char *buffer, int isuserbuffer,
72725 +                            u32 room_left)
72726 +{
72727 +       u32 last_read = pak->last_read - 4;
72728 +       u32 i = (last_read / PAGE_SIZE);
72729 +       u32 pg_offset = last_read & (PAGE_SIZE - 1);
72730 +       u32 to_copy;
72731 +       grant_handle_t handle;
72732 +
72733 +       tpmif_tx_request_t *tx;
72734 +
72735 +       tx = &tpmif->tx->ring[0].req;
72736 +       /*
72737 +        * Start copying data at the page with index 'i'
72738 +        * and within that page at offset 'pg_offset'.
72739 +        * Copy a maximum of 'room_left' bytes.
72740 +        */
72741 +       to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
72742 +       while (to_copy > 0) {
72743 +               void *src;
72744 +               struct gnttab_map_grant_ref map_op;
72745 +               struct gnttab_unmap_grant_ref unmap_op;
72746 +
72747 +               tx = &tpmif->tx->ring[i].req;
72748 +
72749 +               gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
72750 +                                 GNTMAP_host_map, tx->ref, tpmif->domid);
72751 +
72752 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
72753 +                                                      &map_op, 1))) {
72754 +                       BUG();
72755 +               }
72756 +
72757 +               if (map_op.status) {
72758 +                       DPRINTK("Grant table operation failure!\n");
72759 +                       return -EFAULT;
72760 +               }
72761 +
72762 +               handle = map_op.handle;
72763 +
72764 +               if (to_copy > tx->size) {
72765 +                       /*
72766 +                        * User requests more than what's available
72767 +                        */
72768 +                       to_copy = min_t(u32, tx->size, to_copy);
72769 +               }
72770 +
72771 +               DPRINTK("Copying from mapped memory at %08lx\n",
72772 +                       (unsigned long)(idx_to_kaddr(tpmif, i) |
72773 +                                       (tx->addr & ~PAGE_MASK)));
72774 +
72775 +               src = (void *)(idx_to_kaddr(tpmif, i) |
72776 +                              ((tx->addr & ~PAGE_MASK) + pg_offset));
72777 +               if (copy_to_buffer(&buffer[offset],
72778 +                                  src, to_copy, isuserbuffer)) {
72779 +                       return -EFAULT;
72780 +               }
72781 +
72782 +               DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
72783 +                       tpmif->domid, buffer[offset], buffer[offset + 1],
72784 +                       buffer[offset + 2], buffer[offset + 3]);
72785 +
72786 +               gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
72787 +                                   GNTMAP_host_map, handle);
72788 +
72789 +               if (unlikely
72790 +                   (HYPERVISOR_grant_table_op
72791 +                    (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
72792 +                       BUG();
72793 +               }
72794 +
72795 +               offset += to_copy;
72796 +               pg_offset = 0;
72797 +               last_read += to_copy;
72798 +               room_left -= to_copy;
72799 +
72800 +               to_copy = min_t(u32, PAGE_SIZE, room_left);
72801 +               i++;
72802 +       }                       /* while (to_copy > 0) */
72803 +       /*
72804 +        * Adjust the last_read pointer
72805 +        */
72806 +       pak->last_read = last_read + 4;
72807 +       return offset;
72808 +}
72809 +
72810 +/* ============================================================
72811 + * The file layer for reading data from this device
72812 + * ============================================================
72813 + */
72814 +static int vtpm_op_open(struct inode *inode, struct file *f)
72815 +{
72816 +       int rc = 0;
72817 +       unsigned long flags;
72818 +
72819 +       write_lock_irqsave(&dataex.pak_lock, flags);
72820 +       if (dataex.has_opener == 0) {
72821 +               dataex.has_opener = 1;
72822 +       } else {
72823 +               rc = -EPERM;
72824 +       }
72825 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
72826 +       return rc;
72827 +}
72828 +
72829 +static ssize_t vtpm_op_read(struct file *file,
72830 +                           char __user * data, size_t size, loff_t * offset)
72831 +{
72832 +       int ret_size = -ENODATA;
72833 +       struct packet *pak = NULL;
72834 +       unsigned long flags;
72835 +
72836 +       write_lock_irqsave(&dataex.pak_lock, flags);
72837 +       if (dataex.aborted) {
72838 +               dataex.aborted = 0;
72839 +               dataex.copied_so_far = 0;
72840 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
72841 +               return -EIO;
72842 +       }
72843 +
72844 +       if (list_empty(&dataex.pending_pak)) {
72845 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
72846 +               wait_event_interruptible(dataex.wait_queue,
72847 +                                        !list_empty(&dataex.pending_pak));
72848 +               write_lock_irqsave(&dataex.pak_lock, flags);
72849 +               dataex.copied_so_far = 0;
72850 +       }
72851 +
72852 +       if (!list_empty(&dataex.pending_pak)) {
72853 +               unsigned int left;
72854 +
72855 +               pak = list_entry(dataex.pending_pak.next, struct packet, next);
72856 +               left = pak->data_len - dataex.copied_so_far;
72857 +               list_del(&pak->next);
72858 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
72859 +
72860 +               DPRINTK("size given by app: %zu, available: %u\n", size, left);
72861 +
72862 +               ret_size = min_t(size_t, size, left);
72863 +
72864 +               ret_size = packet_read(pak, ret_size, data, size, 1);
72865 +
72866 +               write_lock_irqsave(&dataex.pak_lock, flags);
72867 +
72868 +               if (ret_size < 0) {
72869 +                       del_singleshot_timer_sync(&pak->processing_timer);
72870 +                       packet_free(pak);
72871 +                       dataex.copied_so_far = 0;
72872 +               } else {
72873 +                       DPRINTK("Copied %d bytes to user buffer\n", ret_size);
72874 +
72875 +                       dataex.copied_so_far += ret_size;
72876 +                       if (dataex.copied_so_far >= pak->data_len + 4) {
72877 +                               DPRINTK("All data from this packet given to app.\n");
72878 +                               /* All data given to app */
72879 +
72880 +                               del_singleshot_timer_sync(&pak->
72881 +                                                         processing_timer);
72882 +                               list_add_tail(&pak->next, &dataex.current_pak);
72883 +                               /*
72884 +                                * The more frontends that are handled at the same time,
72885 +                                * the more time we give the TPM to process the request.
72886 +                                */
72887 +                               mod_timer(&pak->processing_timer,
72888 +                                         jiffies + (num_frontends * 60 * HZ));
72889 +                               dataex.copied_so_far = 0;
72890 +                       } else {
72891 +                               list_add(&pak->next, &dataex.pending_pak);
72892 +                       }
72893 +               }
72894 +       }
72895 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
72896 +
72897 +       DPRINTK("Returning result from read to app: %d\n", ret_size);
72898 +
72899 +       return ret_size;
72900 +}
72901 +
72902 +/*
72903 + * Write operation - only works after a previous read operation!
72904 + */
72905 +static ssize_t vtpm_op_write(struct file *file,
72906 +                            const char __user * data, size_t size,
72907 +                            loff_t * offset)
72908 +{
72909 +       struct packet *pak;
72910 +       int rc = 0;
72911 +       unsigned int off = 4;
72912 +       unsigned long flags;
72913 +       struct vtpm_resp_hdr vrh;
72914 +
72915 +       /*
72916 +        * Minimum required packet size is:
72917 +        * 4 bytes for instance number
72918 +        * 2 bytes for tag
72919 +        * 4 bytes for paramSize
72920 +        * 4 bytes for the ordinal
72921 +        * sum: 14 bytes
72922 +        */
72923 +       if (size < sizeof (vrh))
72924 +               return -EFAULT;
72925 +
72926 +       if (copy_from_user(&vrh, data, sizeof (vrh)))
72927 +               return -EFAULT;
72928 +
72929 +       /* malformed packet? */
72930 +       if ((off + ntohl(vrh.len_no)) != size)
72931 +               return -EFAULT;
72932 +
72933 +       write_lock_irqsave(&dataex.pak_lock, flags);
72934 +       pak = packet_find_instance(&dataex.current_pak,
72935 +                                  ntohl(vrh.instance_no));
72936 +
72937 +       if (pak == NULL) {
72938 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
72939 +               DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
72940 +                       ntohl(vrh.instance_no));
72941 +               return -EFAULT;
72942 +       }
72943 +
72944 +       del_singleshot_timer_sync(&pak->processing_timer);
72945 +       list_del(&pak->next);
72946 +
72947 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
72948 +
72949 +       /*
72950 +        * The first 'off' bytes hold the instance number - skip them.
72951 +        */
72952 +       size -= off;
72953 +
72954 +       rc = packet_write(pak, &data[off], size, 1);
72955 +
72956 +       if (rc > 0) {
72957 +               /* account for the 4 instance-number bytes skipped above */
72958 +               rc += off;
72959 +       }
72960 +       packet_free(pak);
72961 +       return rc;
72962 +}
72963 +
72964 +static int vtpm_op_release(struct inode *inode, struct file *file)
72965 +{
72966 +       unsigned long flags;
72967 +
72968 +       vtpm_release_packets(NULL, 1);
72969 +       write_lock_irqsave(&dataex.pak_lock, flags);
72970 +       dataex.has_opener = 0;
72971 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
72972 +       return 0;
72973 +}
72974 +
72975 +static unsigned int vtpm_op_poll(struct file *file,
72976 +                                struct poll_table_struct *pts)
72977 +{
72978 +       unsigned int flags = POLLOUT | POLLWRNORM;
72979 +
72980 +       poll_wait(file, &dataex.wait_queue, pts);
72981 +       if (!list_empty(&dataex.pending_pak)) {
72982 +               flags |= POLLIN | POLLRDNORM;
72983 +       }
72984 +       return flags;
72985 +}
72986 +
72987 +static struct file_operations vtpm_ops = {
72988 +       .owner = THIS_MODULE,
72989 +       .llseek = no_llseek,
72990 +       .open = vtpm_op_open,
72991 +       .read = vtpm_op_read,
72992 +       .write = vtpm_op_write,
72993 +       .release = vtpm_op_release,
72994 +       .poll = vtpm_op_poll,
72995 +};
72996 +
72997 +static struct miscdevice vtpms_miscdevice = {
72998 +       .minor = 225,
72999 +       .name = "vtpm",
73000 +       .fops = &vtpm_ops,
73001 +};
73002 +
73003 +/***************************************************************
73004 + Utility functions
73005 +***************************************************************/
73006 +
73007 +static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
73008 +{
73009 +       int rc;
73010 +       static const unsigned char tpm_error_message_fail[] = {
73011 +               0x00, 0x00,
73012 +               0x00, 0x00, 0x00, 0x0a,
73013 +               0x00, 0x00, 0x00, 0x09  /* TPM_FAIL */
73014 +       };
73015 +       unsigned char buffer[sizeof (tpm_error_message_fail)];
73016 +
73017 +       memcpy(buffer, tpm_error_message_fail,
73018 +              sizeof (tpm_error_message_fail));
73019 +       /*
73020 +        * Insert the right response tag depending on the given tag.
73021 +        * Every response tag is the request tag plus 3.
73022 +        */
73023 +       buffer[1] = req_tag + 3;
73024 +
73025 +       /*
73026 +        * Write the data to shared memory and notify the front-end
73027 +        */
73028 +       rc = packet_write(pak, buffer, sizeof (buffer), 0);
73029 +
73030 +       return rc;
73031 +}
73032 +
73033 +static int _vtpm_release_packets(struct list_head *head,
73034 +                                tpmif_t * tpmif, int send_msgs)
73035 +{
73036 +       int aborted = 0;
73037 +       int c = 0;
73038 +       struct packet *pak;
73039 +       struct list_head *pos, *tmp;
73040 +
73041 +       list_for_each_safe(pos, tmp, head) {
73042 +               pak = list_entry(pos, struct packet, next);
73043 +               c += 1;
73044 +
73045 +               if (tpmif == NULL || pak->tpmif == tpmif) {
73046 +                       int can_send = 0;
73047 +
73048 +                       del_singleshot_timer_sync(&pak->processing_timer);
73049 +                       list_del(&pak->next);
73050 +
73051 +                       if (pak->tpmif && pak->tpmif->status == CONNECTED) {
73052 +                               can_send = 1;
73053 +                       }
73054 +
73055 +                       if (send_msgs && can_send) {
73056 +                               tpm_send_fail_message(pak, pak->req_tag);
73057 +                       }
73058 +                       packet_free(pak);
73059 +                       if (c == 1)
73060 +                               aborted = 1;
73061 +               }
73062 +       }
73063 +       return aborted;
73064 +}
73065 +
73066 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
73067 +{
73068 +       unsigned long flags;
73069 +
73070 +       write_lock_irqsave(&dataex.pak_lock, flags);
73071 +
73072 +       dataex.aborted = _vtpm_release_packets(&dataex.pending_pak,
73073 +                                              tpmif,
73074 +                                              send_msgs);
73075 +       _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
73076 +
73077 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
73078 +       return 0;
73079 +}
73080 +
73081 +static int vtpm_queue_packet(struct packet *pak)
73082 +{
73083 +       int rc = 0;
73084 +
73085 +       if (dataex.has_opener) {
73086 +               unsigned long flags;
73087 +
73088 +               write_lock_irqsave(&dataex.pak_lock, flags);
73089 +               list_add_tail(&pak->next, &dataex.pending_pak);
73090 +               /* give the TPM some time to pick up the request */
73091 +               mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
73092 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
73093 +
73094 +               wake_up_interruptible(&dataex.wait_queue);
73095 +       } else {
73096 +               rc = -EFAULT;
73097 +       }
73098 +       return rc;
73099 +}
73100 +
73101 +static int vtpm_receive(tpmif_t * tpmif, u32 size)
73102 +{
73103 +       int rc = 0;
73104 +       unsigned char buffer[10];
73105 +       __be32 *native_size;
73106 +       struct packet *pak = packet_alloc(tpmif, size, 0, 0);
73107 +
73108 +       if (!pak)
73109 +               return -ENOMEM;
73110 +       /*
73111 +        * Read 10 bytes from the received buffer to test its
73112 +        * content for validity.
73113 +        */
73114 +       if (sizeof (buffer) != packet_read(pak,
73115 +                                          sizeof (buffer), buffer,
73116 +                                          sizeof (buffer), 0)) {
73117 +               goto failexit;
73118 +       }
73119 +       /*
73120 +        * Reset the packet read pointer so we can read all its
73121 +        * contents again.
73122 +        */
73123 +       packet_reset(pak);
73124 +
73125 +       native_size = (__force __be32 *) (&buffer[4 + 2]);
73126 +       /*
73127 +        * Verify that the size of the packet is correct
73128 +        * as indicated and that there's actually someone reading packets.
73129 +        * The minimum size of the packet is '10' for tag, size indicator
73130 +        * and ordinal.
73131 +        */
73132 +       if (size < 10 ||
73133 +           be32_to_cpu(*native_size) != size ||
73134 +           0 == dataex.has_opener || tpmif->status != CONNECTED) {
73135 +               rc = -EINVAL;
73136 +               goto failexit;
73137 +       } else {
73138 +               rc = vtpm_queue_packet(pak);
73139 +               if (rc < 0)
73140 +                       goto failexit;
73141 +       }
73142 +       return 0;
73143 +
73144 +      failexit:
73145 +       if (pak) {
73146 +               tpm_send_fail_message(pak, buffer[4 + 1]);
73147 +               packet_free(pak);
73148 +       }
73149 +       return rc;
73150 +}
73151 +
73152 +/*
73153 + * Timeout function that gets invoked when a packet has not been processed
73154 + * during the timeout period.
73155 + * The packet must be on a list when this function is invoked. This
73156 + * also means that once it is taken off a list, the timer must be
73157 + * destroyed as well.
73158 + */
73159 +static void processing_timeout(unsigned long ptr)
73160 +{
73161 +       struct packet *pak = (struct packet *)ptr;
73162 +       unsigned long flags;
73163 +
73164 +       write_lock_irqsave(&dataex.pak_lock, flags);
73165 +       /*
73166 +        * Check whether the packet is still on one of
73167 +        * the lists before acting on it.
73168 +        */
73169 +       if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
73170 +           pak == packet_find_packet(&dataex.current_pak, pak)) {
73171 +               if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
73172 +                       tpm_send_fail_message(pak, pak->req_tag);
73173 +               }
73174 +               /* discard future responses */
73175 +               pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
73176 +       }
73177 +
73178 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
73179 +}
73180 +
73181 +static void tpm_tx_action(unsigned long unused);
73182 +static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
73183 +
73184 +static struct list_head tpm_schedule_list;
73185 +static spinlock_t tpm_schedule_list_lock;
73186 +
73187 +static inline void maybe_schedule_tx_action(void)
73188 +{
73189 +       smp_mb();
73190 +       tasklet_schedule(&tpm_tx_tasklet);
73191 +}
73192 +
73193 +static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
73194 +{
73195 +       return tpmif->list.next != NULL;
73196 +}
73197 +
73198 +static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
73199 +{
73200 +       spin_lock_irq(&tpm_schedule_list_lock);
73201 +       if (likely(__on_tpm_schedule_list(tpmif))) {
73202 +               list_del(&tpmif->list);
73203 +               tpmif->list.next = NULL;
73204 +               tpmif_put(tpmif);
73205 +       }
73206 +       spin_unlock_irq(&tpm_schedule_list_lock);
73207 +}
73208 +
73209 +static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
73210 +{
73211 +       if (__on_tpm_schedule_list(tpmif))
73212 +               return;
73213 +
73214 +       spin_lock_irq(&tpm_schedule_list_lock);
73215 +       if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
73216 +               list_add_tail(&tpmif->list, &tpm_schedule_list);
73217 +               tpmif_get(tpmif);
73218 +       }
73219 +       spin_unlock_irq(&tpm_schedule_list_lock);
73220 +}
73221 +
73222 +void tpmif_schedule_work(tpmif_t * tpmif)
73223 +{
73224 +       add_to_tpm_schedule_list_tail(tpmif);
73225 +       maybe_schedule_tx_action();
73226 +}
73227 +
73228 +void tpmif_deschedule_work(tpmif_t * tpmif)
73229 +{
73230 +       remove_from_tpm_schedule_list(tpmif);
73231 +}
73232 +
73233 +static void tpm_tx_action(unsigned long unused)
73234 +{
73235 +       struct list_head *ent;
73236 +       tpmif_t *tpmif;
73237 +       tpmif_tx_request_t *tx;
73238 +
73239 +       DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
73240 +
73241 +       while (!list_empty(&tpm_schedule_list)) {
73242 +               /* Get a tpmif from the list with work to do. */
73243 +               ent = tpm_schedule_list.next;
73244 +               tpmif = list_entry(ent, tpmif_t, list);
73245 +               tpmif_get(tpmif);
73246 +               remove_from_tpm_schedule_list(tpmif);
73247 +
73248 +               tx = &tpmif->tx->ring[0].req;
73249 +
73250 +               /* pass it up */
73251 +               vtpm_receive(tpmif, tx->size);
73252 +
73253 +               tpmif_put(tpmif);
73254 +       }
73255 +}
73256 +
73257 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
73258 +{
73259 +       tpmif_t *tpmif = (tpmif_t *) dev_id;
73260 +
73261 +       add_to_tpm_schedule_list_tail(tpmif);
73262 +       maybe_schedule_tx_action();
73263 +       return IRQ_HANDLED;
73264 +}
73265 +
73266 +static int __init tpmback_init(void)
73267 +{
73268 +       int rc;
73269 +
73270 +       if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
73271 +               printk(KERN_ALERT
73272 +                      "Could not register misc device for TPM BE.\n");
73273 +               return rc;
73274 +       }
73275 +
73276 +       dataex_init(&dataex);
73277 +
73278 +       spin_lock_init(&tpm_schedule_list_lock);
73279 +       INIT_LIST_HEAD(&tpm_schedule_list);
73280 +
73281 +       tpmif_interface_init();
73282 +       tpmif_xenbus_init();
73283 +
73284 +       printk(KERN_INFO "Successfully initialized TPM backend driver.\n");
73285 +
73286 +       return 0;
73287 +}
73288 +
73289 +module_init(tpmback_init);
73290 +
73291 +void __exit tpmback_exit(void)
73292 +{
73293 +       vtpm_release_packets(NULL, 0);
73294 +       tpmif_xenbus_exit();
73295 +       tpmif_interface_exit();
73296 +       misc_deregister(&vtpms_miscdevice);
73297 +}
73298 +
73299 +MODULE_LICENSE("Dual BSD/GPL");
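
A hypothetical user-space sketch (not part of the patch) of the packet framing that vtpm_op_read()/vtpm_op_write() above implement: every request handed to the reader starts with a 4-byte vTPM instance number in network byte order, followed by the raw TPM command (tag, paramSize, ordinal, ...), and the reply written back must carry the same 4 instance bytes in front of the TPM response. The device node path /dev/vtpm and the buffer size are assumptions made only for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <arpa/inet.h>

int main(void)
{
        uint8_t buf[4096];
        uint32_t instance, length, ordinal;
        uint16_t tag;
        ssize_t n;
        int fd = open("/dev/vtpm", O_RDWR);

        if (fd < 0)
                return 1;

        /* One request: 4-byte instance number + complete TPM command blob. */
        n = read(fd, buf, sizeof(buf));
        if (n < 14) {           /* minimum size also enforced by vtpm_op_write() */
                close(fd);
                return 1;
        }

        memcpy(&instance, &buf[0], 4);  instance = ntohl(instance);  /* vTPM instance   */
        memcpy(&tag,      &buf[4], 2);  tag      = ntohs(tag);       /* TPM command tag */
        memcpy(&length,   &buf[6], 4);  length   = ntohl(length);    /* TPM paramSize   */
        memcpy(&ordinal,  &buf[10], 4); ordinal  = ntohl(ordinal);   /* TPM ordinal     */

        printf("vTPM %u: tag 0x%04x, %u bytes, ordinal 0x%08x\n",
               instance, tag, length, ordinal);

        /*
         * The TPM reply would be written back here with the same 4
         * instance-number bytes prepended, which is what
         * packet_find_instance()/packet_write() above key off.
         */
        close(fd);
        return 0;
}
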
73300 diff -ruNp linux-2.6.19/drivers/xen/tpmback/xenbus.c linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/xenbus.c
73301 --- linux-2.6.19/drivers/xen/tpmback/xenbus.c   1970-01-01 00:00:00.000000000 +0000
73302 +++ linux-2.6.19-xen-3.0.4/drivers/xen/tpmback/xenbus.c 2007-02-02 19:10:45.000000000 +0000
73303 @@ -0,0 +1,289 @@
73304 +/*  Xenbus code for tpmif backend
73305 +    Copyright (C) 2005 IBM Corporation
73306 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
73307 +
73308 +    This program is free software; you can redistribute it and/or modify
73309 +    it under the terms of the GNU General Public License as published by
73310 +    the Free Software Foundation; either version 2 of the License, or
73311 +    (at your option) any later version.
73312 +
73313 +    This program is distributed in the hope that it will be useful,
73314 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
73315 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
73316 +    GNU General Public License for more details.
73317 +
73318 +    You should have received a copy of the GNU General Public License
73319 +    along with this program; if not, write to the Free Software
73320 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
73321 +*/
73322 +#include <stdarg.h>
73323 +#include <linux/module.h>
73324 +#include <xen/xenbus.h>
73325 +#include "common.h"
73326 +
73327 +struct backend_info
73328 +{
73329 +       struct xenbus_device *dev;
73330 +
73331 +       /* our communications channel */
73332 +       tpmif_t *tpmif;
73333 +
73334 +       long int frontend_id;
73335 +       long int instance; // instance of the TPM
73336 +       u8 is_instance_set; // whether the instance number has been set
73337 +
73338 +       /* watch front end for changes */
73339 +       struct xenbus_watch backend_watch;
73340 +};
73341 +
73342 +static void maybe_connect(struct backend_info *be);
73343 +static void connect(struct backend_info *be);
73344 +static int connect_ring(struct backend_info *be);
73345 +static void backend_changed(struct xenbus_watch *watch,
73346 +                           const char **vec, unsigned int len);
73347 +static void frontend_changed(struct xenbus_device *dev,
73348 +                            enum xenbus_state frontend_state);
73349 +
73350 +long int tpmback_get_instance(struct backend_info *bi)
73351 +{
73352 +       long int res = -1;
73353 +       if (bi && bi->is_instance_set)
73354 +               res = bi->instance;
73355 +       return res;
73356 +}
73357 +
73358 +static int tpmback_remove(struct xenbus_device *dev)
73359 +{
73360 +       struct backend_info *be = dev->dev.driver_data;
73361 +
73362 +       if (!be) return 0;
73363 +
73364 +       if (be->backend_watch.node) {
73365 +               unregister_xenbus_watch(&be->backend_watch);
73366 +               kfree(be->backend_watch.node);
73367 +               be->backend_watch.node = NULL;
73368 +       }
73369 +       if (be->tpmif) {
73370 +               be->tpmif->bi = NULL;
73371 +               vtpm_release_packets(be->tpmif, 0);
73372 +               tpmif_put(be->tpmif);
73373 +               be->tpmif = NULL;
73374 +       }
73375 +       kfree(be);
73376 +       dev->dev.driver_data = NULL;
73377 +       return 0;
73378 +}
73379 +
73380 +static int tpmback_probe(struct xenbus_device *dev,
73381 +                        const struct xenbus_device_id *id)
73382 +{
73383 +       int err;
73384 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
73385 +                                         GFP_KERNEL);
73386 +
73387 +       if (!be) {
73388 +               xenbus_dev_fatal(dev, -ENOMEM,
73389 +                                "allocating backend structure");
73390 +               return -ENOMEM;
73391 +       }
73392 +
73393 +       be->is_instance_set = 0;
73394 +       be->dev = dev;
73395 +       dev->dev.driver_data = be;
73396 +
73397 +       err = xenbus_watch_path2(dev, dev->nodename,
73398 +                                "instance", &be->backend_watch,
73399 +                                backend_changed);
73400 +       if (err) {
73401 +               goto fail;
73402 +       }
73403 +
73404 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
73405 +       if (err) {
73406 +               goto fail;
73407 +       }
73408 +       return 0;
73409 +fail:
73410 +       tpmback_remove(dev);
73411 +       return err;
73412 +}
73413 +
73414 +
73415 +static void backend_changed(struct xenbus_watch *watch,
73416 +                           const char **vec, unsigned int len)
73417 +{
73418 +       int err;
73419 +       long instance;
73420 +       struct backend_info *be
73421 +               = container_of(watch, struct backend_info, backend_watch);
73422 +       struct xenbus_device *dev = be->dev;
73423 +
73424 +       err = xenbus_scanf(XBT_NIL, dev->nodename,
73425 +                          "instance","%li", &instance);
73426 +       if (XENBUS_EXIST_ERR(err)) {
73427 +               return;
73428 +       }
73429 +
73430 +       if (err != 1) {
73431 +               xenbus_dev_fatal(dev, err, "reading instance");
73432 +               return;
73433 +       }
73434 +
73435 +       if (be->is_instance_set == 0) {
73436 +               be->instance = instance;
73437 +               be->is_instance_set = 1;
73438 +       }
73439 +}
73440 +
73441 +
73442 +static void frontend_changed(struct xenbus_device *dev,
73443 +                            enum xenbus_state frontend_state)
73444 +{
73445 +       struct backend_info *be = dev->dev.driver_data;
73446 +       int err;
73447 +
73448 +       switch (frontend_state) {
73449 +       case XenbusStateInitialising:
73450 +       case XenbusStateInitialised:
73451 +               break;
73452 +
73453 +       case XenbusStateConnected:
73454 +               err = connect_ring(be);
73455 +               if (err) {
73456 +                       return;
73457 +               }
73458 +               maybe_connect(be);
73459 +               break;
73460 +
73461 +       case XenbusStateClosing:
73462 +               be->instance = -1;
73463 +               xenbus_switch_state(dev, XenbusStateClosing);
73464 +               break;
73465 +
73466 +       case XenbusStateUnknown: /* keep it here */
73467 +       case XenbusStateClosed:
73468 +               xenbus_switch_state(dev, XenbusStateClosed);
73469 +               device_unregister(&be->dev->dev);
73470 +               tpmback_remove(dev);
73471 +               break;
73472 +
73473 +       default:
73474 +               xenbus_dev_fatal(dev, -EINVAL,
73475 +                                "saw state %d at frontend",
73476 +                                frontend_state);
73477 +               break;
73478 +       }
73479 +}
73480 +
73481 +
73482 +
73483 +static void maybe_connect(struct backend_info *be)
73484 +{
73485 +       if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
73486 +               return;
73487 +
73488 +       connect(be);
73489 +}
73490 +
73491 +
73492 +static void connect(struct backend_info *be)
73493 +{
73494 +       struct xenbus_transaction xbt;
73495 +       int err;
73496 +       struct xenbus_device *dev = be->dev;
73497 +       unsigned long ready = 1;
73498 +
73499 +again:
73500 +       err = xenbus_transaction_start(&xbt);
73501 +       if (err) {
73502 +               xenbus_dev_fatal(be->dev, err, "starting transaction");
73503 +               return;
73504 +       }
73505 +
73506 +       err = xenbus_printf(xbt, be->dev->nodename,
73507 +                           "ready", "%lu", ready);
73508 +       if (err) {
73509 +               xenbus_dev_fatal(be->dev, err, "writing 'ready'");
73510 +               goto abort;
73511 +       }
73512 +
73513 +       err = xenbus_transaction_end(xbt, 0);
73514 +       if (err == -EAGAIN)
73515 +               goto again;
73516 +       if (err)
73517 +               xenbus_dev_fatal(be->dev, err, "end of transaction");
73518 +
73519 +       err = xenbus_switch_state(dev, XenbusStateConnected);
73520 +       if (!err)
73521 +               be->tpmif->status = CONNECTED;
73522 +       return;
73523 +abort:
73524 +       xenbus_transaction_end(xbt, 1);
73525 +}
73526 +
73527 +
73528 +static int connect_ring(struct backend_info *be)
73529 +{
73530 +       struct xenbus_device *dev = be->dev;
73531 +       unsigned long ring_ref;
73532 +       unsigned int evtchn;
73533 +       int err;
73534 +
73535 +       err = xenbus_gather(XBT_NIL, dev->otherend,
73536 +                           "ring-ref", "%lu", &ring_ref,
73537 +                           "event-channel", "%u", &evtchn, NULL);
73538 +       if (err) {
73539 +               xenbus_dev_error(dev, err,
73540 +                                "reading %s/ring-ref and event-channel",
73541 +                                dev->otherend);
73542 +               return err;
73543 +       }
73544 +
73545 +       if (!be->tpmif) {
73546 +               be->tpmif = tpmif_find(dev->otherend_id, be);
73547 +               if (IS_ERR(be->tpmif)) {
73548 +                       err = PTR_ERR(be->tpmif);
73549 +                       be->tpmif = NULL;
73550 +                       xenbus_dev_fatal(dev,err,"creating vtpm interface");
73551 +                       return err;
73552 +               }
73553 +       }
73554 +
73555 +       if (be->tpmif != NULL) {
73556 +               err = tpmif_map(be->tpmif, ring_ref, evtchn);
73557 +               if (err) {
73558 +                       xenbus_dev_error(dev, err,
73559 +                                        "mapping shared-frame %lu port %u",
73560 +                                        ring_ref, evtchn);
73561 +                       return err;
73562 +               }
73563 +       }
73564 +       return 0;
73565 +}
73566 +
73567 +
73568 +static struct xenbus_device_id tpmback_ids[] = {
73569 +       { "vtpm" },
73570 +       { "" }
73571 +};
73572 +
73573 +
73574 +static struct xenbus_driver tpmback = {
73575 +       .name = "vtpm",
73576 +       .owner = THIS_MODULE,
73577 +       .ids = tpmback_ids,
73578 +       .probe = tpmback_probe,
73579 +       .remove = tpmback_remove,
73580 +       .otherend_changed = frontend_changed,
73581 +};
73582 +
73583 +
73584 +void tpmif_xenbus_init(void)
73585 +{
73586 +       xenbus_register_backend(&tpmback);
73587 +}
73588 +
73589 +void tpmif_xenbus_exit(void)
73590 +{
73591 +       xenbus_unregister_driver(&tpmback);
73592 +}
73593 diff -ruNp linux-2.6.19/drivers/xen/util.c linux-2.6.19-xen-3.0.4/drivers/xen/util.c
73594 --- linux-2.6.19/drivers/xen/util.c     1970-01-01 00:00:00.000000000 +0000
73595 +++ linux-2.6.19-xen-3.0.4/drivers/xen/util.c   2007-02-02 19:10:45.000000000 +0000
73596 @@ -0,0 +1,70 @@
73597 +
73598 +#include <linux/mm.h>
73599 +#include <linux/module.h>
73600 +#include <linux/slab.h>
73601 +#include <linux/vmalloc.h>
73602 +#include <asm/uaccess.h>
73603 +#include <xen/driver_util.h>
73604 +
73605 +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
73606 +{
73607 +       /* apply_to_page_range() does all the hard work. */
73608 +       return 0;
73609 +}
73610 +
73611 +struct vm_struct *alloc_vm_area(unsigned long size)
73612 +{
73613 +       struct vm_struct *area;
73614 +
73615 +       area = get_vm_area(size, VM_IOREMAP);
73616 +       if (area == NULL)
73617 +               return NULL;
73618 +
73619 +       /*
73620 +        * This ensures that page tables are constructed for this region
73621 +        * of kernel virtual address space and mapped into init_mm.
73622 +        */
73623 +       if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
73624 +                               area->size, f, NULL)) {
73625 +               free_vm_area(area);
73626 +               return NULL;
73627 +       }
73628 +
73629 +       return area;
73630 +}
73631 +EXPORT_SYMBOL_GPL(alloc_vm_area);
73632 +
73633 +void free_vm_area(struct vm_struct *area)
73634 +{
73635 +       struct vm_struct *ret;
73636 +       ret = remove_vm_area(area->addr);
73637 +       BUG_ON(ret != area);
73638 +       kfree(area);
73639 +}
73640 +EXPORT_SYMBOL_GPL(free_vm_area);
73641 +
73642 +void lock_vm_area(struct vm_struct *area)
73643 +{
73644 +       unsigned long i;
73645 +       char c;
73646 +
73647 +       /*
73648 +        * Prevent context switch to a lazy mm that doesn't have this area
73649 +        * mapped into its page tables.
73650 +        */
73651 +       preempt_disable();
73652 +
73653 +       /*
73654 +        * Ensure that the page tables are mapped into the current mm. The
73655 +        * page-fault path will copy the page directory pointers from init_mm.
73656 +        */
73657 +       for (i = 0; i < area->size; i += PAGE_SIZE)
73658 +               (void)__get_user(c, (char __user *)area->addr + i);
73659 +}
73660 +EXPORT_SYMBOL_GPL(lock_vm_area);
73661 +
73662 +void unlock_vm_area(struct vm_struct *area)
73663 +{
73664 +       preempt_enable();
73665 +}
73666 +EXPORT_SYMBOL_GPL(unlock_vm_area);
73667 diff -ruNp linux-2.6.19/drivers/xen/xenbus/Makefile linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/Makefile
73668 --- linux-2.6.19/drivers/xen/xenbus/Makefile    1970-01-01 00:00:00.000000000 +0000
73669 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/Makefile  2007-02-02 19:10:45.000000000 +0000
73670 @@ -0,0 +1,9 @@
73671 +obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
73672 +obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
73673 +
73674 +xenbus_be-objs =
73675 +xenbus_be-objs += xenbus_backend_client.o
73676 +
73677 +xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
73678 +obj-y += $(xenbus-y) $(xenbus-m)
73679 +obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
73680 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_backend_client.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_backend_client.c
73681 --- linux-2.6.19/drivers/xen/xenbus/xenbus_backend_client.c     1970-01-01 00:00:00.000000000 +0000
73682 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_backend_client.c   2007-02-02 19:10:45.000000000 +0000
73683 @@ -0,0 +1,147 @@
73684 +/******************************************************************************
73685 + * Backend-client-facing interface for the Xenbus driver.  In other words, the
73686 + * interface between the Xenbus and the device-specific code in the backend
73687 + * driver.
73688 + *
73689 + * Copyright (C) 2005-2006 XenSource Ltd
73690 + * 
73691 + * This program is free software; you can redistribute it and/or
73692 + * modify it under the terms of the GNU General Public License version 2
73693 + * as published by the Free Software Foundation; or, when distributed
73694 + * separately from the Linux kernel or incorporated into other
73695 + * software packages, subject to the following license:
73696 + * 
73697 + * Permission is hereby granted, free of charge, to any person obtaining a copy
73698 + * of this source file (the "Software"), to deal in the Software without
73699 + * restriction, including without limitation the rights to use, copy, modify,
73700 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
73701 + * and to permit persons to whom the Software is furnished to do so, subject to
73702 + * the following conditions:
73703 + * 
73704 + * The above copyright notice and this permission notice shall be included in
73705 + * all copies or substantial portions of the Software.
73706 + * 
73707 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73708 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
73709 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
73710 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
73711 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
73712 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
73713 + * IN THE SOFTWARE.
73714 + */
73715 +
73716 +#include <linux/err.h>
73717 +#include <xen/gnttab.h>
73718 +#include <xen/xenbus.h>
73719 +#include <xen/driver_util.h>
73720 +
73721 +/* Based on Rusty Russell's skeleton driver's map_page */
73722 +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref)
73723 +{
73724 +       struct gnttab_map_grant_ref op;
73725 +       struct vm_struct *area;
73726 +
73727 +       area = alloc_vm_area(PAGE_SIZE);
73728 +       if (!area)
73729 +               return ERR_PTR(-ENOMEM);
73730 +
73731 +       gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
73732 +                         gnt_ref, dev->otherend_id);
73733 +       
73734 +       lock_vm_area(area);
73735 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
73736 +       unlock_vm_area(area);
73737 +
73738 +       if (op.status != GNTST_okay) {
73739 +               free_vm_area(area);
73740 +               xenbus_dev_fatal(dev, op.status,
73741 +                                "mapping in shared page %d from domain %d",
73742 +                                gnt_ref, dev->otherend_id);
73743 +               BUG_ON(!IS_ERR(ERR_PTR(op.status)));
73744 +               return ERR_PTR(op.status);
73745 +       }
73746 +
73747 +       /* Stuff the handle in an unused field */
73748 +       area->phys_addr = (unsigned long)op.handle;
73749 +
73750 +       return area;
73751 +}
73752 +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
73753 +
73754 +
73755 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
73756 +                  grant_handle_t *handle, void *vaddr)
73757 +{
73758 +       struct gnttab_map_grant_ref op;
73759 +       
73760 +       gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
73761 +                         gnt_ref, dev->otherend_id);
73762 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
73763 +
73764 +       if (op.status != GNTST_okay) {
73765 +               xenbus_dev_fatal(dev, op.status,
73766 +                                "mapping in shared page %d from domain %d",
73767 +                                gnt_ref, dev->otherend_id);
73768 +       } else
73769 +               *handle = op.handle;
73770 +
73771 +       return op.status;
73772 +}
73773 +EXPORT_SYMBOL_GPL(xenbus_map_ring);
73774 +
73775 +
73776 +/* Based on Rusty Russell's skeleton driver's unmap_page */
73777 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
73778 +{
73779 +       struct gnttab_unmap_grant_ref op;
73780 +
73781 +       gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
73782 +                           (grant_handle_t)area->phys_addr);
73783 +
73784 +       lock_vm_area(area);
73785 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
73786 +       unlock_vm_area(area);
73787 +
73788 +       if (op.status == GNTST_okay)
73789 +               free_vm_area(area);
73790 +       else
73791 +               xenbus_dev_error(dev, op.status,
73792 +                                "unmapping page at handle %d error %d",
73793 +                                (int16_t)area->phys_addr, op.status);
73794 +
73795 +       return op.status;
73796 +}
73797 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
73798 +
73799 +
73800 +int xenbus_unmap_ring(struct xenbus_device *dev,
73801 +                    grant_handle_t handle, void *vaddr)
73802 +{
73803 +       struct gnttab_unmap_grant_ref op;
73804 +
73805 +       gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
73806 +                           handle);
73807 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
73808 +
73809 +       if (op.status != GNTST_okay)
73810 +               xenbus_dev_error(dev, op.status,
73811 +                                "unmapping page at handle %d error %d",
73812 +                                handle, op.status);
73813 +
73814 +       return op.status;
73815 +}
73816 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
73817 +
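
The four helpers above are what a backend typically uses to attach to, and later detach from, a page the frontend has granted. The sketch below is illustrative only (the demo_* names and the ring_ref parameter are invented, not part of the patch); it is a fragment relying on the same headers this file already includes.

/* Illustrative only: map the frontend's granted page, use it, unmap it. */
static void *demo_map_frontend_page(struct xenbus_device *dev, int ring_ref,
				    struct vm_struct **area_out)
{
	struct vm_struct *area = xenbus_map_ring_valloc(dev, ring_ref);

	if (IS_ERR(area))
		return NULL;	/* xenbus_dev_fatal() has already been called */

	*area_out = area;
	return area->addr;	/* the shared page is now mapped here */
}

static void demo_unmap_frontend_page(struct xenbus_device *dev,
				     struct vm_struct *area)
{
	/* Frees the vm area on success; logs via xenbus_dev_error() otherwise. */
	xenbus_unmap_ring_vfree(dev, area);
}
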
73818 +int xenbus_dev_is_online(struct xenbus_device *dev)
73819 +{
73820 +       int rc, val;
73821 +
73822 +       rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
73823 +       if (rc != 1)
73824 +               val = 0; /* no online node present */
73825 +
73826 +       return val;
73827 +}
73828 +EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
73829 +
73830 +MODULE_LICENSE("Dual BSD/GPL");
73831 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_client.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_client.c
73832 --- linux-2.6.19/drivers/xen/xenbus/xenbus_client.c     1970-01-01 00:00:00.000000000 +0000
73833 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_client.c   2007-02-02 19:10:45.000000000 +0000
73834 @@ -0,0 +1,305 @@
73835 +/******************************************************************************
73836 + * Client-facing interface for the Xenbus driver.  In other words, the
73837 + * interface between the Xenbus and the device-specific code, be it the
73838 + * frontend or the backend of that driver.
73839 + *
73840 + * Copyright (C) 2005 XenSource Ltd
73841 + * 
73842 + * This program is free software; you can redistribute it and/or
73843 + * modify it under the terms of the GNU General Public License version 2
73844 + * as published by the Free Software Foundation; or, when distributed
73845 + * separately from the Linux kernel or incorporated into other
73846 + * software packages, subject to the following license:
73847 + * 
73848 + * Permission is hereby granted, free of charge, to any person obtaining a copy
73849 + * of this source file (the "Software"), to deal in the Software without
73850 + * restriction, including without limitation the rights to use, copy, modify,
73851 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
73852 + * and to permit persons to whom the Software is furnished to do so, subject to
73853 + * the following conditions:
73854 + * 
73855 + * The above copyright notice and this permission notice shall be included in
73856 + * all copies or substantial portions of the Software.
73857 + * 
73858 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73859 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
73860 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
73861 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
73862 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
73863 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
73864 + * IN THE SOFTWARE.
73865 + */
73866 +
73867 +#include <xen/evtchn.h>
73868 +#include <xen/gnttab.h>
73869 +#include <xen/xenbus.h>
73870 +#include <xen/driver_util.h>
73871 +
73872 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
73873 +#include <xen/platform-compat.h>
73874 +#endif
73875 +
73876 +#define DPRINTK(fmt, args...) \
73877 +    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
73878 +
73879 +char *xenbus_strstate(enum xenbus_state state)
73880 +{
73881 +       static char *name[] = {
73882 +               [ XenbusStateUnknown      ] = "Unknown",
73883 +               [ XenbusStateInitialising ] = "Initialising",
73884 +               [ XenbusStateInitWait     ] = "InitWait",
73885 +               [ XenbusStateInitialised  ] = "Initialised",
73886 +               [ XenbusStateConnected    ] = "Connected",
73887 +               [ XenbusStateClosing      ] = "Closing",
73888 +               [ XenbusStateClosed       ] = "Closed",
73889 +       };
73890 +       return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
73891 +}
73892 +EXPORT_SYMBOL_GPL(xenbus_strstate);
73893 +
73894 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
73895 +                     struct xenbus_watch *watch,
73896 +                     void (*callback)(struct xenbus_watch *,
73897 +                                      const char **, unsigned int))
73898 +{
73899 +       int err;
73900 +
73901 +       watch->node = path;
73902 +       watch->callback = callback;
73903 +
73904 +       err = register_xenbus_watch(watch);
73905 +
73906 +       if (err) {
73907 +               watch->node = NULL;
73908 +               watch->callback = NULL;
73909 +               xenbus_dev_fatal(dev, err, "adding watch on %s", path);
73910 +       }
73911 +
73912 +       return err;
73913 +}
73914 +EXPORT_SYMBOL_GPL(xenbus_watch_path);
73915 +
73916 +
73917 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
73918 +                      const char *path2, struct xenbus_watch *watch,
73919 +                      void (*callback)(struct xenbus_watch *,
73920 +                                       const char **, unsigned int))
73921 +{
73922 +       int err;
73923 +       char *state = kasprintf(GFP_KERNEL, "%s/%s", path, path2);
73924 +       if (!state) {
73925 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
73926 +               return -ENOMEM;
73927 +       }
73928 +       err = xenbus_watch_path(dev, state, watch, callback);
73929 +
73930 +       if (err)
73931 +               kfree(state);
73932 +       return err;
73933 +}
73934 +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
73935 +
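
A hedged sketch of how a driver might use xenbus_watch_path2(): the demo_info structure, the demo_* names and the "feature-barrier" node are illustrative assumptions, not anything defined by this patch. On success the kasprintf()ed path stays live as watch->node and must be kfree()d after unregister_xenbus_watch(), as free_otherend_watch() does later in this patch.

#include <linux/kernel.h>
#include <xen/xenbus.h>

struct demo_info {
	struct xenbus_device *dev;
	struct xenbus_watch feature_watch;
};

/* Called by xenwatch whenever <otherend>/feature-barrier changes. */
static void demo_feature_changed(struct xenbus_watch *watch,
				 const char **vec, unsigned int len)
{
	struct demo_info *info =
		container_of(watch, struct demo_info, feature_watch);
	int val = 0;

	/* vec[XS_WATCH_PATH] names the node that fired; re-read its value. */
	if (xenbus_scanf(XBT_NIL, info->dev->otherend,
			 "feature-barrier", "%d", &val) != 1)
		val = 0;
	pr_debug("feature-barrier is now %d\n", val);
}

static int demo_watch_backend(struct demo_info *info)
{
	/* On failure the helper reports via xenbus_dev_fatal() and returns
	 * the error; the watch node string is allocated for us on success. */
	return xenbus_watch_path2(info->dev, info->dev->otherend,
				  "feature-barrier", &info->feature_watch,
				  demo_feature_changed);
}
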
73936 +
73937 +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
73938 +{
73939 +       /* We check whether the state is currently set to the given value, and
73940 +          if not, then the state is set.  We don't want to unconditionally
73941 +          write the given state, because we don't want to fire watches
73942 +          unnecessarily.  Furthermore, if the node has gone, we don't write
73943 +          to it, as the device will be tearing down, and we don't want to
73944 +          resurrect that directory.
73945 +
73946 +          Note that, because of this cached value of our state, this function
73947 +          will not work inside a Xenstore transaction (something it
73948 +          attempted in the past) because dev->state would not get reset if
73949 +          the transaction was aborted.
73950 +
73951 +        */
73952 +
73953 +       int current_state;
73954 +       int err;
73955 +
73956 +       if (state == dev->state)
73957 +               return 0;
73958 +
73959 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
73960 +                          &current_state);
73961 +       if (err != 1)
73962 +               return 0;
73963 +
73964 +       err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
73965 +       if (err) {
73966 +               if (state != XenbusStateClosing) /* Avoid looping */
73967 +                       xenbus_dev_fatal(dev, err, "writing new state");
73968 +               return err;
73969 +       }
73970 +
73971 +       dev->state = state;
73972 +
73973 +       return 0;
73974 +}
73975 +EXPORT_SYMBOL_GPL(xenbus_switch_state);
73976 +
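
Because of the cached-state caveat in the comment above, drivers publish their ring details inside a transaction but call xenbus_switch_state() only after the transaction has ended. A minimal sketch follows; xenbus_transaction_start() and the "ring-ref"/"event-channel" node names are assumptions of the sketch, not something established by this file.

static int demo_publish_and_connect(struct xenbus_device *dev,
				    int ring_ref, int evtchn)
{
	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", ring_ref);
	if (!err)
		err = xenbus_printf(xbt, dev->nodename,
				    "event-channel", "%d", evtchn);
	if (err) {
		xenbus_transaction_end(xbt, 1);
		return err;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;
	if (err)
		return err;

	/* Only now, outside the transaction, update the cached state. */
	return xenbus_switch_state(dev, XenbusStateInitialised);
}
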
73977 +int xenbus_frontend_closed(struct xenbus_device *dev)
73978 +{
73979 +       xenbus_switch_state(dev, XenbusStateClosed);
73980 +       complete(&dev->down);
73981 +       return 0;
73982 +}
73983 +EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
73984 +
73985 +/**
73986 + * Return the path to the error node for the given device, or NULL on failure.
73987 + * If the value returned is non-NULL, then it is the caller's to kfree.
73988 + */
73989 +static char *error_path(struct xenbus_device *dev)
73990 +{
73991 +       return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
73992 +}
73993 +
73994 +
73995 +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
73996 +               va_list ap)
73997 +{
73998 +       int ret;
73999 +       unsigned int len;
74000 +       char *printf_buffer = NULL, *path_buffer = NULL;
74001 +
74002 +#define PRINTF_BUFFER_SIZE 4096
74003 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
74004 +       if (printf_buffer == NULL)
74005 +               goto fail;
74006 +
74007 +       len = sprintf(printf_buffer, "%i ", -err);
74008 +       ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
74009 +
74010 +       BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
74011 +
74012 +       dev_err(&dev->dev, "%s\n", printf_buffer);
74013 +
74014 +       path_buffer = error_path(dev);
74015 +
74016 +       if (path_buffer == NULL) {
74017 +               printk("xenbus: failed to write error node for %s (%s)\n",
74018 +                      dev->nodename, printf_buffer);
74019 +               goto fail;
74020 +       }
74021 +
74022 +       if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
74023 +               printk("xenbus: failed to write error node for %s (%s)\n",
74024 +                      dev->nodename, printf_buffer);
74025 +               goto fail;
74026 +       }
74027 +
74028 +fail:
74029 +       if (printf_buffer)
74030 +               kfree(printf_buffer);
74031 +       if (path_buffer)
74032 +               kfree(path_buffer);
74033 +}
74034 +
74035 +
74036 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
74037 +                     ...)
74038 +{
74039 +       va_list ap;
74040 +
74041 +       va_start(ap, fmt);
74042 +       _dev_error(dev, err, fmt, ap);
74043 +       va_end(ap);
74044 +}
74045 +EXPORT_SYMBOL_GPL(xenbus_dev_error);
74046 +
74047 +
74048 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
74049 +                     ...)
74050 +{
74051 +       va_list ap;
74052 +
74053 +       va_start(ap, fmt);
74054 +       _dev_error(dev, err, fmt, ap);
74055 +       va_end(ap);
74056 +
74057 +       xenbus_switch_state(dev, XenbusStateClosing);
74058 +}
74059 +EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
74060 +
74061 +
74062 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
74063 +{
74064 +       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
74065 +       if (err < 0)
74066 +               xenbus_dev_fatal(dev, err, "granting access to ring page");
74067 +       return err;
74068 +}
74069 +EXPORT_SYMBOL_GPL(xenbus_grant_ring);
74070 +
74071 +
74072 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
74073 +{
74074 +       struct evtchn_alloc_unbound alloc_unbound;
74075 +       int err;
74076 +
74077 +       alloc_unbound.dom        = DOMID_SELF;
74078 +       alloc_unbound.remote_dom = dev->otherend_id;
74079 +
74080 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
74081 +                                         &alloc_unbound);
74082 +       if (err)
74083 +               xenbus_dev_fatal(dev, err, "allocating event channel");
74084 +       else
74085 +               *port = alloc_unbound.port;
74086 +
74087 +       return err;
74088 +}
74089 +EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
74090 +
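
xenbus_grant_ring() and xenbus_alloc_evtchn() are normally used back to back when a frontend brings up a shared ring. The fragment below is an assumption-laden illustration: get_zeroed_page()/free_page() are standard kernel calls, virt_to_mfn() comes from the Xen headers elsewhere in this patch series, and a real driver would also revoke the grant on the error path.

static int demo_setup_ring(struct xenbus_device *dev, unsigned long *page_out,
			   int *ring_ref_out, int *evtchn_out)
{
	unsigned long page = get_zeroed_page(GFP_KERNEL);
	int err;

	if (!page)
		return -ENOMEM;

	/* Returns the grant reference on success, a negative error otherwise. */
	err = xenbus_grant_ring(dev, virt_to_mfn((void *)page));
	if (err < 0)
		goto fail;
	*ring_ref_out = err;

	err = xenbus_alloc_evtchn(dev, evtchn_out);
	if (err)
		goto fail;	/* a real driver would also end the grant here */

	*page_out = page;
	return 0;

fail:
	free_page(page);
	return err;
}
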
74091 +
74092 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
74093 +{
74094 +       struct evtchn_bind_interdomain bind_interdomain;
74095 +       int err;
74096 +
74097 +       bind_interdomain.remote_dom  = dev->otherend_id;
74098 +       bind_interdomain.remote_port = remote_port;
74099 +
74100 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
74101 +                                         &bind_interdomain);
74102 +       if (err)
74103 +               xenbus_dev_fatal(dev, err,
74104 +                                "binding to event channel %d from domain %d",
74105 +                                remote_port, dev->otherend_id);
74106 +       else
74107 +               *port = bind_interdomain.local_port;
74108 +
74109 +       return err;
74110 +}
74111 +EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
74112 +
74113 +
74114 +int xenbus_free_evtchn(struct xenbus_device *dev, int port)
74115 +{
74116 +       struct evtchn_close close;
74117 +       int err;
74118 +
74119 +       close.port = port;
74120 +
74121 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
74122 +       if (err)
74123 +               xenbus_dev_error(dev, err, "freeing event channel %d", port);
74124 +
74125 +       return err;
74126 +}
74127 +EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
74128 +
74129 +
74130 +enum xenbus_state xenbus_read_driver_state(const char *path)
74131 +{
74132 +       enum xenbus_state result;
74133 +       int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
74134 +       if (err)
74135 +               result = XenbusStateUnknown;
74136 +
74137 +       return result;
74138 +}
74139 +EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
74140 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_comms.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_comms.c
74141 --- linux-2.6.19/drivers/xen/xenbus/xenbus_comms.c      1970-01-01 00:00:00.000000000 +0000
74142 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_comms.c    2007-02-02 19:10:45.000000000 +0000
74143 @@ -0,0 +1,210 @@
74144 +/******************************************************************************
74145 + * xenbus_comms.c
74146 + *
74147 + * Low level code to talk to Xen Store: ringbuffer and event channel.
74148 + *
74149 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
74150 + * 
74151 + * This program is free software; you can redistribute it and/or
74152 + * modify it under the terms of the GNU General Public License version 2
74153 + * as published by the Free Software Foundation; or, when distributed
74154 + * separately from the Linux kernel or incorporated into other
74155 + * software packages, subject to the following license:
74156 + * 
74157 + * Permission is hereby granted, free of charge, to any person obtaining a copy
74158 + * of this source file (the "Software"), to deal in the Software without
74159 + * restriction, including without limitation the rights to use, copy, modify,
74160 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
74161 + * and to permit persons to whom the Software is furnished to do so, subject to
74162 + * the following conditions:
74163 + * 
74164 + * The above copyright notice and this permission notice shall be included in
74165 + * all copies or substantial portions of the Software.
74166 + * 
74167 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
74168 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74169 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
74170 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
74171 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
74172 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
74173 + * IN THE SOFTWARE.
74174 + */
74175 +
74176 +#include <linux/wait.h>
74177 +#include <linux/interrupt.h>
74178 +#include <linux/sched.h>
74179 +#include <linux/err.h>
74180 +#include <linux/ptrace.h>
74181 +#include <xen/evtchn.h>
74182 +#include <xen/xenbus.h>
74183 +
74184 +#include <asm/hypervisor.h>
74185 +
74186 +#include "xenbus_comms.h"
74187 +
74188 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
74189 +#include <xen/platform-compat.h>
74190 +#endif
74191 +
74192 +static int xenbus_irq;
74193 +
74194 +extern void xenbus_probe(void *);
74195 +extern int xenstored_ready;
74196 +static DECLARE_WORK(probe_work, xenbus_probe, NULL);
74197 +
74198 +static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
74199 +
74200 +static irqreturn_t wake_waiting(int irq, void *unused)
74201 +{
74202 +       if (unlikely(xenstored_ready == 0)) {
74203 +               xenstored_ready = 1;
74204 +               schedule_work(&probe_work);
74205 +       }
74206 +
74207 +       wake_up(&xb_waitq);
74208 +       return IRQ_HANDLED;
74209 +}
74210 +
74211 +static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
74212 +{
74213 +       return ((prod - cons) <= XENSTORE_RING_SIZE);
74214 +}
74215 +
74216 +static void *get_output_chunk(XENSTORE_RING_IDX cons,
74217 +                             XENSTORE_RING_IDX prod,
74218 +                             char *buf, uint32_t *len)
74219 +{
74220 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
74221 +       if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
74222 +               *len = XENSTORE_RING_SIZE - (prod - cons);
74223 +       return buf + MASK_XENSTORE_IDX(prod);
74224 +}
74225 +
74226 +static const void *get_input_chunk(XENSTORE_RING_IDX cons,
74227 +                                  XENSTORE_RING_IDX prod,
74228 +                                  const char *buf, uint32_t *len)
74229 +{
74230 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
74231 +       if ((prod - cons) < *len)
74232 +               *len = prod - cons;
74233 +       return buf + MASK_XENSTORE_IDX(cons);
74234 +}
74235 +
74236 +int xb_write(const void *data, unsigned len)
74237 +{
74238 +       struct xenstore_domain_interface *intf = xen_store_interface;
74239 +       XENSTORE_RING_IDX cons, prod;
74240 +       int rc;
74241 +
74242 +       while (len != 0) {
74243 +               void *dst;
74244 +               unsigned int avail;
74245 +
74246 +               rc = wait_event_interruptible(
74247 +                       xb_waitq,
74248 +                       (intf->req_prod - intf->req_cons) !=
74249 +                       XENSTORE_RING_SIZE);
74250 +               if (rc < 0)
74251 +                       return rc;
74252 +
74253 +               /* Read indexes, then verify. */
74254 +               cons = intf->req_cons;
74255 +               prod = intf->req_prod;
74256 +               mb();
74257 +               if (!check_indexes(cons, prod)) {
74258 +                       intf->req_cons = intf->req_prod = 0;
74259 +                       return -EIO;
74260 +               }
74261 +
74262 +               dst = get_output_chunk(cons, prod, intf->req, &avail);
74263 +               if (avail == 0)
74264 +                       continue;
74265 +               if (avail > len)
74266 +                       avail = len;
74267 +
74268 +               memcpy(dst, data, avail);
74269 +               data += avail;
74270 +               len -= avail;
74271 +
74272 +               /* Other side must not see new header until data is there. */
74273 +               wmb();
74274 +               intf->req_prod += avail;
74275 +
74276 +               /* This implies mb() before other side sees interrupt. */
74277 +               notify_remote_via_evtchn(xen_store_evtchn);
74278 +       }
74279 +
74280 +       return 0;
74281 +}
74282 +
74283 +int xb_read(void *data, unsigned len)
74284 +{
74285 +       struct xenstore_domain_interface *intf = xen_store_interface;
74286 +       XENSTORE_RING_IDX cons, prod;
74287 +       int rc;
74288 +
74289 +       while (len != 0) {
74290 +               unsigned int avail;
74291 +               const char *src;
74292 +
74293 +               rc = wait_event_interruptible(
74294 +                       xb_waitq,
74295 +                       intf->rsp_cons != intf->rsp_prod);
74296 +               if (rc < 0)
74297 +                       return rc;
74298 +
74299 +               /* Read indexes, then verify. */
74300 +               cons = intf->rsp_cons;
74301 +               prod = intf->rsp_prod;
74302 +               mb();
74303 +               if (!check_indexes(cons, prod)) {
74304 +                       intf->rsp_cons = intf->rsp_prod = 0;
74305 +                       return -EIO;
74306 +               }
74307 +
74308 +               src = get_input_chunk(cons, prod, intf->rsp, &avail);
74309 +               if (avail == 0)
74310 +                       continue;
74311 +               if (avail > len)
74312 +                       avail = len;
74313 +
74314 +               /* We must read header before we read data. */
74315 +               rmb();
74316 +
74317 +               memcpy(data, src, avail);
74318 +               data += avail;
74319 +               len -= avail;
74320 +
74321 +               /* Other side must not see free space until we've copied out */
74322 +               mb();
74323 +               intf->rsp_cons += avail;
74324 +
74325 +               pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
74326 +
74327 +               /* Implies mb(): they will see new header. */
74328 +               notify_remote_via_evtchn(xen_store_evtchn);
74329 +       }
74330 +
74331 +       return 0;
74332 +}
74333 +
74334 +/* Set up interrupt handler off store event channel. */
74335 +int xb_init_comms(void)
74336 +{
74337 +       int err;
74338 +
74339 +       if (xenbus_irq)
74340 +               unbind_from_irqhandler(xenbus_irq, &xb_waitq);
74341 +
74342 +       err = bind_evtchn_to_irqhandler(
74343 +               xen_store_evtchn, wake_waiting,
74344 +               0, "xenbus", &xb_waitq);
74345 +       if (err <= 0) {
74346 +               printk(KERN_ERR "XENBUS request irq failed %i\n", err);
74347 +               return err;
74348 +       }
74349 +
74350 +       xenbus_irq = err;
74351 +
74352 +       return 0;
74353 +}
74354 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_comms.h linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_comms.h
74355 --- linux-2.6.19/drivers/xen/xenbus/xenbus_comms.h      1970-01-01 00:00:00.000000000 +0000
74356 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_comms.h    2007-02-02 19:10:45.000000000 +0000
74357 @@ -0,0 +1,44 @@
74358 +/*
74359 + * Private include for xenbus communications.
74360 + * 
74361 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
74362 + *
74363 + * This program is free software; you can redistribute it and/or
74364 + * modify it under the terms of the GNU General Public License version 2
74365 + * as published by the Free Software Foundation; or, when distributed
74366 + * separately from the Linux kernel or incorporated into other
74367 + * software packages, subject to the following license:
74368 + * 
74369 + * Permission is hereby granted, free of charge, to any person obtaining a copy
74370 + * of this source file (the "Software"), to deal in the Software without
74371 + * restriction, including without limitation the rights to use, copy, modify,
74372 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
74373 + * and to permit persons to whom the Software is furnished to do so, subject to
74374 + * the following conditions:
74375 + * 
74376 + * The above copyright notice and this permission notice shall be included in
74377 + * all copies or substantial portions of the Software.
74378 + * 
74379 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
74380 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74381 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
74382 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
74383 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
74384 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
74385 + * IN THE SOFTWARE.
74386 + */
74387 +
74388 +#ifndef _XENBUS_COMMS_H
74389 +#define _XENBUS_COMMS_H
74390 +
74391 +int xs_init(void);
74392 +int xb_init_comms(void);
74393 +
74394 +/* Low level routines. */
74395 +int xb_write(const void *data, unsigned len);
74396 +int xb_read(void *data, unsigned len);
74397 +int xs_input_avail(void);
74398 +extern struct xenstore_domain_interface *xen_store_interface;
74399 +extern int xen_store_evtchn;
74400 +
74401 +#endif /* _XENBUS_COMMS_H */
74402 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_dev.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_dev.c
74403 --- linux-2.6.19/drivers/xen/xenbus/xenbus_dev.c        1970-01-01 00:00:00.000000000 +0000
74404 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_dev.c      2007-02-02 19:10:45.000000000 +0000
74405 @@ -0,0 +1,361 @@
74406 +/*
74407 + * xenbus_dev.c
74408 + * 
74409 + * Driver giving user-space access to the kernel's xenbus connection
74410 + * to xenstore.
74411 + * 
74412 + * Copyright (c) 2005, Christian Limpach
74413 + * Copyright (c) 2005, Rusty Russell, IBM Corporation
74414 + * 
74415 + * This program is free software; you can redistribute it and/or
74416 + * modify it under the terms of the GNU General Public License version 2
74417 + * as published by the Free Software Foundation; or, when distributed
74418 + * separately from the Linux kernel or incorporated into other
74419 + * software packages, subject to the following license:
74420 + * 
74421 + * Permission is hereby granted, free of charge, to any person obtaining a copy
74422 + * of this source file (the "Software"), to deal in the Software without
74423 + * restriction, including without limitation the rights to use, copy, modify,
74424 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
74425 + * and to permit persons to whom the Software is furnished to do so, subject to
74426 + * the following conditions:
74427 + * 
74428 + * The above copyright notice and this permission notice shall be included in
74429 + * all copies or substantial portions of the Software.
74430 + * 
74431 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
74432 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74433 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
74434 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
74435 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
74436 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
74437 + * IN THE SOFTWARE.
74438 + */
74439 +
74440 +#include <linux/kernel.h>
74441 +#include <linux/errno.h>
74442 +#include <linux/uio.h>
74443 +#include <linux/notifier.h>
74444 +#include <linux/wait.h>
74445 +#include <linux/fs.h>
74446 +#include <linux/poll.h>
74447 +#include <linux/mutex.h>
74448 +
74449 +#include "xenbus_comms.h"
74450 +
74451 +#include <asm/uaccess.h>
74452 +#include <asm/hypervisor.h>
74453 +#include <xen/xenbus.h>
74454 +#include <xen/xen_proc.h>
74455 +#include <asm/hypervisor.h>
74456 +
74457 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
74458 +#include <xen/platform-compat.h>
74459 +#endif
74460 +
74461 +struct xenbus_dev_transaction {
74462 +       struct list_head list;
74463 +       struct xenbus_transaction handle;
74464 +};
74465 +
74466 +struct xenbus_dev_data {
74467 +       /* In-progress transaction. */
74468 +       struct list_head transactions;
74469 +
74470 +       /* Active watches. */
74471 +       struct list_head watches;
74472 +
74473 +       /* Partial request. */
74474 +       unsigned int len;
74475 +       union {
74476 +               struct xsd_sockmsg msg;
74477 +               char buffer[PAGE_SIZE];
74478 +       } u;
74479 +
74480 +       /* Response queue. */
74481 +#define MASK_READ_IDX(idx) ((idx)&(PAGE_SIZE-1))
74482 +       char read_buffer[PAGE_SIZE];
74483 +       unsigned int read_cons, read_prod;
74484 +       wait_queue_head_t read_waitq;
74485 +
74486 +       struct mutex reply_mutex;
74487 +};
74488 +
74489 +static struct proc_dir_entry *xenbus_dev_intf;
74490 +
74491 +static ssize_t xenbus_dev_read(struct file *filp,
74492 +                              char __user *ubuf,
74493 +                              size_t len, loff_t *ppos)
74494 +{
74495 +       struct xenbus_dev_data *u = filp->private_data;
74496 +       int i;
74497 +
74498 +       if (wait_event_interruptible(u->read_waitq,
74499 +                                    u->read_prod != u->read_cons))
74500 +               return -EINTR;
74501 +
74502 +       for (i = 0; i < len; i++) {
74503 +               if (u->read_cons == u->read_prod)
74504 +                       break;
74505 +               put_user(u->read_buffer[MASK_READ_IDX(u->read_cons)], ubuf+i);
74506 +               u->read_cons++;
74507 +       }
74508 +
74509 +       return i;
74510 +}
74511 +
74512 +static void queue_reply(struct xenbus_dev_data *u,
74513 +                       char *data, unsigned int len)
74514 +{
74515 +       int i;
74516 +
74517 +       mutex_lock(&u->reply_mutex);
74518 +
74519 +       for (i = 0; i < len; i++, u->read_prod++)
74520 +               u->read_buffer[MASK_READ_IDX(u->read_prod)] = data[i];
74521 +
74522 +       BUG_ON((u->read_prod - u->read_cons) > sizeof(u->read_buffer));
74523 +
74524 +       mutex_unlock(&u->reply_mutex);
74525 +
74526 +       wake_up(&u->read_waitq);
74527 +}
74528 +
74529 +struct watch_adapter
74530 +{
74531 +       struct list_head list;
74532 +       struct xenbus_watch watch;
74533 +       struct xenbus_dev_data *dev_data;
74534 +       char *token;
74535 +};
74536 +
74537 +static void free_watch_adapter (struct watch_adapter *watch)
74538 +{
74539 +       kfree(watch->watch.node);
74540 +       kfree(watch->token);
74541 +       kfree(watch);
74542 +}
74543 +
74544 +static void watch_fired(struct xenbus_watch *watch,
74545 +                       const char **vec,
74546 +                       unsigned int len)
74547 +{
74548 +       struct watch_adapter *adap =
74549 +            container_of(watch, struct watch_adapter, watch);
74550 +       struct xsd_sockmsg hdr;
74551 +       const char *path, *token;
74552 +       int path_len, tok_len, body_len;
74553 +
74554 +       path = vec[XS_WATCH_PATH];
74555 +       token = adap->token;
74556 +
74557 +       path_len = strlen(path) + 1;
74558 +       tok_len = strlen(token) + 1;
74559 +       body_len = path_len + tok_len;
74560 +
74561 +       hdr.type = XS_WATCH_EVENT;
74562 +       hdr.len = body_len;
74563 +       
74564 +       queue_reply(adap->dev_data, (char *)&hdr, sizeof(hdr));
74565 +       queue_reply(adap->dev_data, (char *)path, path_len);
74566 +       queue_reply(adap->dev_data, (char *)token, tok_len);
74567 +}
74568 +
74569 +static LIST_HEAD(watch_list);
74570 +
74571 +static ssize_t xenbus_dev_write(struct file *filp,
74572 +                               const char __user *ubuf,
74573 +                               size_t len, loff_t *ppos)
74574 +{
74575 +       struct xenbus_dev_data *u = filp->private_data;
74576 +       struct xenbus_dev_transaction *trans = NULL;
74577 +       uint32_t msg_type;
74578 +       void *reply;
74579 +       char *path, *token;
74580 +       struct watch_adapter *watch, *tmp_watch;
74581 +       int err;
74582 +
74583 +       if ((len + u->len) > sizeof(u->u.buffer))
74584 +               return -EINVAL;
74585 +
74586 +       if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0)
74587 +               return -EFAULT;
74588 +
74589 +       u->len += len;
74590 +       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
74591 +               return len;
74592 +
74593 +       msg_type = u->u.msg.type;
74594 +
74595 +       switch (msg_type) {
74596 +       case XS_TRANSACTION_START:
74597 +       case XS_TRANSACTION_END:
74598 +       case XS_DIRECTORY:
74599 +       case XS_READ:
74600 +       case XS_GET_PERMS:
74601 +       case XS_RELEASE:
74602 +       case XS_GET_DOMAIN_PATH:
74603 +       case XS_WRITE:
74604 +       case XS_MKDIR:
74605 +       case XS_RM:
74606 +       case XS_SET_PERMS:
74607 +               if (msg_type == XS_TRANSACTION_START) {
74608 +                       trans = kmalloc(sizeof(*trans), GFP_KERNEL);
74609 +                       if (!trans)
74610 +                               return -ENOMEM;
74611 +               }
74612 +
74613 +               reply = xenbus_dev_request_and_reply(&u->u.msg);
74614 +               if (IS_ERR(reply)) {
74615 +                       kfree(trans);
74616 +                       return PTR_ERR(reply);
74617 +               }
74618 +
74619 +               if (msg_type == XS_TRANSACTION_START) {
74620 +                       trans->handle.id = simple_strtoul(reply, NULL, 0);
74621 +                       list_add(&trans->list, &u->transactions);
74622 +               } else if (msg_type == XS_TRANSACTION_END) {
74623 +                       list_for_each_entry(trans, &u->transactions, list)
74624 +                               if (trans->handle.id == u->u.msg.tx_id)
74625 +                                       break;
74626 +                       BUG_ON(&trans->list == &u->transactions);
74627 +                       list_del(&trans->list);
74628 +                       kfree(trans);
74629 +               }
74630 +               queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
74631 +               queue_reply(u, (char *)reply, u->u.msg.len);
74632 +               kfree(reply);
74633 +               break;
74634 +
74635 +       case XS_WATCH:
74636 +       case XS_UNWATCH:
74637 +               path = u->u.buffer + sizeof(u->u.msg);
74638 +               token = memchr(path, 0, u->u.msg.len);
74639 +               if (token == NULL)
74640 +                       return -EILSEQ;
74641 +               token++;
74642 +
74643 +               if (msg_type == XS_WATCH) {
74644 +                       static const char * XS_WATCH_RESP = "OK";
74645 +                       struct xsd_sockmsg hdr;
74646 +
74647 +                       watch = kmalloc(sizeof(*watch), GFP_KERNEL);
74648 +                       watch->watch.node = kmalloc(strlen(path)+1,
74649 +                                                    GFP_KERNEL);
74650 +                       strcpy((char *)watch->watch.node, path);
74651 +                       watch->watch.callback = watch_fired;
74652 +                       watch->token = kmalloc(strlen(token)+1, GFP_KERNEL);
74653 +                       strcpy(watch->token, token);
74654 +                       watch->dev_data = u;
74655 +
74656 +                       err = register_xenbus_watch(&watch->watch);
74657 +                       if (err) {
74658 +                               free_watch_adapter(watch);
74659 +                               return err;
74660 +                       }
74661 +                       
74662 +                       list_add(&watch->list, &u->watches);
74663 +
74664 +                       hdr.type = XS_WATCH;
74665 +                       hdr.len = strlen(XS_WATCH_RESP) + 1;
74666 +                       queue_reply(u, (char *)&hdr, sizeof(hdr));
74667 +                       queue_reply(u, (char *)XS_WATCH_RESP, hdr.len);
74668 +               } else {
74669 +                       list_for_each_entry_safe(watch, tmp_watch,
74670 +                                                 &u->watches, list) {
74671 +                               if (!strcmp(watch->token, token) &&
74672 +                                   !strcmp(watch->watch.node, path))
74673 +                               {
74674 +                                       /* Matching watch: unregister and free it. */
74675 +                                       unregister_xenbus_watch(&watch->watch);
74676 +                                       list_del(&watch->list);
74677 +                                       free_watch_adapter(watch);
74678 +                                       break;
74679 +                               }
74680 +                       }
74681 +               }
74682 +
74683 +               break;
74684 +
74685 +       default:
74686 +               return -EINVAL;
74687 +       }
74688 +
74689 +       u->len = 0;
74690 +       return len;
74691 +}
74692 +
74693 +static int xenbus_dev_open(struct inode *inode, struct file *filp)
74694 +{
74695 +       struct xenbus_dev_data *u;
74696 +
74697 +       if (xen_store_evtchn == 0)
74698 +               return -ENOENT;
74699 +
74700 +       nonseekable_open(inode, filp);
74701 +
74702 +       u = kzalloc(sizeof(*u), GFP_KERNEL);
74703 +       if (u == NULL)
74704 +               return -ENOMEM;
74705 +
74706 +       INIT_LIST_HEAD(&u->transactions);
74707 +       INIT_LIST_HEAD(&u->watches);
74708 +       init_waitqueue_head(&u->read_waitq);
74709 +
74710 +       mutex_init(&u->reply_mutex);
74711 +
74712 +       filp->private_data = u;
74713 +
74714 +       return 0;
74715 +}
74716 +
74717 +static int xenbus_dev_release(struct inode *inode, struct file *filp)
74718 +{
74719 +       struct xenbus_dev_data *u = filp->private_data;
74720 +       struct xenbus_dev_transaction *trans, *tmp;
74721 +       struct watch_adapter *watch, *tmp_watch;
74722 +
74723 +       list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
74724 +               xenbus_transaction_end(trans->handle, 1);
74725 +               list_del(&trans->list);
74726 +               kfree(trans);
74727 +       }
74728 +
74729 +       list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
74730 +               unregister_xenbus_watch(&watch->watch);
74731 +               list_del(&watch->list);
74732 +               free_watch_adapter(watch);
74733 +       }
74734 +
74735 +       kfree(u);
74736 +
74737 +       return 0;
74738 +}
74739 +
74740 +static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
74741 +{
74742 +       struct xenbus_dev_data *u = file->private_data;
74743 +
74744 +       poll_wait(file, &u->read_waitq, wait);
74745 +       if (u->read_cons != u->read_prod)
74746 +               return POLLIN | POLLRDNORM;
74747 +       return 0;
74748 +}
74749 +
74750 +static struct file_operations xenbus_dev_file_ops = {
74751 +       .read = xenbus_dev_read,
74752 +       .write = xenbus_dev_write,
74753 +       .open = xenbus_dev_open,
74754 +       .release = xenbus_dev_release,
74755 +       .poll = xenbus_dev_poll,
74756 +};
74757 +
74758 +int __init
74759 +xenbus_dev_init(void)
74760 +{
74761 +       xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
74762 +       if (xenbus_dev_intf)
74763 +               xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
74764 +
74765 +       return 0;
74766 +}
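
The device registered above speaks the raw xenstore wire format: each request is a struct xsd_sockmsg header followed by its payload, and each reply comes back the same way through xenbus_dev_read(). The userspace sketch below is illustrative only; it assumes the Xen public header defining struct xsd_sockmsg and XS_READ is available as <xen/io/xs_wire.h>, ignores short reads, and the "domid" node is just an example.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <xen/io/xs_wire.h>

int main(void)
{
	const char path[] = "domid";		/* example node, NUL included */
	char req[sizeof(struct xsd_sockmsg) + sizeof(path)];
	struct xsd_sockmsg hdr = { .type = XS_READ, .len = sizeof(path) };
	char reply[4096];
	int fd = open("/proc/xen/xenbus", O_RDWR);

	if (fd < 0)
		return 1;

	memcpy(req, &hdr, sizeof(hdr));
	memcpy(req + sizeof(hdr), path, sizeof(path));

	/* xenbus_dev_write() buffers until header plus payload have arrived,
	 * then queues the reply header and body for us to read back. */
	if (write(fd, req, sizeof(req)) != (ssize_t)sizeof(req))
		return 1;
	if (read(fd, &hdr, sizeof(hdr)) != (ssize_t)sizeof(hdr))
		return 1;
	if (read(fd, reply, hdr.len) != (ssize_t)hdr.len)
		return 1;

	printf("%.*s\n", (int)hdr.len, reply);
	close(fd);
	return 0;
}
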
74767 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_probe.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_probe.c
74768 --- linux-2.6.19/drivers/xen/xenbus/xenbus_probe.c      1970-01-01 00:00:00.000000000 +0000
74769 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_probe.c    2007-02-02 19:10:45.000000000 +0000
74770 @@ -0,0 +1,1017 @@
74771 +/******************************************************************************
74772 + * Talks to Xen Store to figure out what devices we have.
74773 + *
74774 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
74775 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
74776 + * Copyright (C) 2005, 2006 XenSource Ltd
74777 + * 
74778 + * This program is free software; you can redistribute it and/or
74779 + * modify it under the terms of the GNU General Public License version 2
74780 + * as published by the Free Software Foundation; or, when distributed
74781 + * separately from the Linux kernel or incorporated into other
74782 + * software packages, subject to the following license:
74783 + * 
74784 + * Permission is hereby granted, free of charge, to any person obtaining a copy
74785 + * of this source file (the "Software"), to deal in the Software without
74786 + * restriction, including without limitation the rights to use, copy, modify,
74787 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
74788 + * and to permit persons to whom the Software is furnished to do so, subject to
74789 + * the following conditions:
74790 + * 
74791 + * The above copyright notice and this permission notice shall be included in
74792 + * all copies or substantial portions of the Software.
74793 + * 
74794 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
74795 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74796 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
74797 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
74798 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
74799 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
74800 + * IN THE SOFTWARE.
74801 + */
74802 +
74803 +#define DPRINTK(fmt, args...)                          \
74804 +       pr_debug("xenbus_probe (%s:%d) " fmt ".\n",     \
74805 +                __FUNCTION__, __LINE__, ##args)
74806 +
74807 +#include <linux/kernel.h>
74808 +#include <linux/err.h>
74809 +#include <linux/string.h>
74810 +#include <linux/ctype.h>
74811 +#include <linux/fcntl.h>
74812 +#include <linux/mm.h>
74813 +#include <linux/notifier.h>
74814 +#include <linux/kthread.h>
74815 +#include <linux/mutex.h>
74816 +
74817 +#include <asm/io.h>
74818 +#include <asm/page.h>
74819 +#include <asm/maddr.h>
74820 +#include <asm/pgtable.h>
74821 +#include <asm/hypervisor.h>
74822 +#include <xen/xenbus.h>
74823 +#include <xen/xen_proc.h>
74824 +#include <xen/evtchn.h>
74825 +#include <xen/features.h>
74826 +#include <xen/hvm.h>
74827 +
74828 +#include "xenbus_comms.h"
74829 +#include "xenbus_probe.h"
74830 +
74831 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
74832 +#include <xen/platform-compat.h>
74833 +#endif
74834 +
74835 +int xen_store_evtchn;
74836 +struct xenstore_domain_interface *xen_store_interface;
74837 +static unsigned long xen_store_mfn;
74838 +
74839 +extern struct mutex xenwatch_mutex;
74840 +
74841 +static BLOCKING_NOTIFIER_HEAD(xenstore_notifier_list);
74842 +
74843 +static void wait_for_devices(struct xenbus_driver *xendrv);
74844 +
74845 +static int xenbus_probe_frontend(const char *type, const char *name);
74846 +
74847 +static void xenbus_dev_shutdown(struct device *_dev);
74848 +
74849 +/* If something in array of ids matches this device, return it. */
74850 +static const struct xenbus_device_id *
74851 +match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
74852 +{
74853 +       for (; *arr->devicetype != '\0'; arr++) {
74854 +               if (!strcmp(arr->devicetype, dev->devicetype))
74855 +                       return arr;
74856 +       }
74857 +       return NULL;
74858 +}
74859 +
74860 +int xenbus_match(struct device *_dev, struct device_driver *_drv)
74861 +{
74862 +       struct xenbus_driver *drv = to_xenbus_driver(_drv);
74863 +
74864 +       if (!drv->ids)
74865 +               return 0;
74866 +
74867 +       return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
74868 +}
74869 +
74870 +/* device/<type>/<id> => <type>-<id> */
74871 +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
74872 +{
74873 +       nodename = strchr(nodename, '/');
74874 +       if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
74875 +               printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
74876 +               return -EINVAL;
74877 +       }
74878 +
74879 +       strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
74880 +       if (!strchr(bus_id, '/')) {
74881 +               printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
74882 +               return -EINVAL;
74883 +       }
74884 +       *strchr(bus_id, '/') = '-';
74885 +       return 0;
74886 +}
74887 +
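
For example, frontend_bus_id() turns the nodename "device/vif/0" into the bus id "vif-0" and "device/vbd/51712" into "vbd-51712"; a nodename without a second slash-separated component is rejected with -EINVAL.
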
74888 +
74889 +static void free_otherend_details(struct xenbus_device *dev)
74890 +{
74891 +       kfree(dev->otherend);
74892 +       dev->otherend = NULL;
74893 +}
74894 +
74895 +
74896 +static void free_otherend_watch(struct xenbus_device *dev)
74897 +{
74898 +       if (dev->otherend_watch.node) {
74899 +               unregister_xenbus_watch(&dev->otherend_watch);
74900 +               kfree(dev->otherend_watch.node);
74901 +               dev->otherend_watch.node = NULL;
74902 +       }
74903 +}
74904 +
74905 +
74906 +int read_otherend_details(struct xenbus_device *xendev,
74907 +                                char *id_node, char *path_node)
74908 +{
74909 +       int err = xenbus_gather(XBT_NIL, xendev->nodename,
74910 +                               id_node, "%i", &xendev->otherend_id,
74911 +                               path_node, NULL, &xendev->otherend,
74912 +                               NULL);
74913 +       if (err) {
74914 +               xenbus_dev_fatal(xendev, err,
74915 +                                "reading other end details from %s",
74916 +                                xendev->nodename);
74917 +               return err;
74918 +       }
74919 +       if (strlen(xendev->otherend) == 0 ||
74920 +           !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
74921 +               xenbus_dev_fatal(xendev, -ENOENT,
74922 +                                "unable to read other end from %s.  "
74923 +                                "missing or inaccessible.",
74924 +                                xendev->nodename);
74925 +               free_otherend_details(xendev);
74926 +               return -ENOENT;
74927 +       }
74928 +
74929 +       return 0;
74930 +}
74931 +
74932 +
74933 +static int read_backend_details(struct xenbus_device *xendev)
74934 +{
74935 +       return read_otherend_details(xendev, "backend-id", "backend");
74936 +}
74937 +
74938 +
74939 +/* Bus type for frontend drivers. */
74940 +static struct xen_bus_type xenbus_frontend = {
74941 +       .root = "device",
74942 +       .levels = 2,            /* device/type/<id> */
74943 +       .get_bus_id = frontend_bus_id,
74944 +       .probe = xenbus_probe_frontend,
74945 +       .bus = {
74946 +               .name     = "xen",
74947 +               .match    = xenbus_match,
74948 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
74949 +               .probe    = xenbus_dev_probe,
74950 +               .remove   = xenbus_dev_remove,
74951 +               .shutdown = xenbus_dev_shutdown,
74952 +#endif
74953 +       },
74954 +       .dev = {
74955 +               .bus_id = "xen",
74956 +       },
74957 +};
74958 +
74959 +static void otherend_changed(struct xenbus_watch *watch,
74960 +                            const char **vec, unsigned int len)
74961 +{
74962 +       struct xenbus_device *dev =
74963 +               container_of(watch, struct xenbus_device, otherend_watch);
74964 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
74965 +       enum xenbus_state state;
74966 +
74967 +       /* Protect us against watches firing on old details when the otherend
74968 +          details change, say immediately after a resume. */
74969 +       if (!dev->otherend ||
74970 +           strncmp(dev->otherend, vec[XS_WATCH_PATH],
74971 +                   strlen(dev->otherend))) {
74972 +               DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
74973 +               return;
74974 +       }
74975 +
74976 +       state = xenbus_read_driver_state(dev->otherend);
74977 +
74978 +       DPRINTK("state is %d (%s), %s, %s", state, xenbus_strstate(state),
74979 +               dev->otherend_watch.node, vec[XS_WATCH_PATH]);
74980 +
74981 +       /*
74982 +        * Ignore xenbus transitions during shutdown. This prevents us doing
74983 +        * work that can fail e.g., when the rootfs is gone.
74984 +        */
74985 +       if (system_state > SYSTEM_RUNNING) {
74986 +               struct xen_bus_type *bus =
74987 +                       container_of(dev->dev.bus, struct xen_bus_type, bus);
74988 +               /* If we're frontend, drive the state machine to Closed. */
74989 +               /* This should cause the backend to release our resources. */
74990 +               if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
74991 +                       xenbus_frontend_closed(dev);
74992 +               return;
74993 +       }
74994 +
74995 +       if (drv->otherend_changed)
74996 +               drv->otherend_changed(dev, state);
74997 +}
74998 +
74999 +
75000 +static int talk_to_otherend(struct xenbus_device *dev)
75001 +{
75002 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
75003 +
75004 +       free_otherend_watch(dev);
75005 +       free_otherend_details(dev);
75006 +
75007 +       return drv->read_otherend_details(dev);
75008 +}
75009 +
75010 +
75011 +static int watch_otherend(struct xenbus_device *dev)
75012 +{
75013 +       return xenbus_watch_path2(dev, dev->otherend, "state",
75014 +                                 &dev->otherend_watch, otherend_changed);
75015 +}
75016 +
75017 +
75018 +int xenbus_dev_probe(struct device *_dev)
75019 +{
75020 +       struct xenbus_device *dev = to_xenbus_device(_dev);
75021 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
75022 +       const struct xenbus_device_id *id;
75023 +       int err;
75024 +
75025 +       DPRINTK("%s", dev->nodename);
75026 +
75027 +       if (!drv->probe) {
75028 +               err = -ENODEV;
75029 +               goto fail;
75030 +       }
75031 +
75032 +       id = match_device(drv->ids, dev);
75033 +       if (!id) {
75034 +               err = -ENODEV;
75035 +               goto fail;
75036 +       }
75037 +
75038 +       err = talk_to_otherend(dev);
75039 +       if (err) {
75040 +               printk(KERN_WARNING
75041 +                      "xenbus_probe: talk_to_otherend on %s failed.\n",
75042 +                      dev->nodename);
75043 +               return err;
75044 +       }
75045 +
75046 +       err = drv->probe(dev, id);
75047 +       if (err)
75048 +               goto fail;
75049 +
75050 +       err = watch_otherend(dev);
75051 +       if (err) {
75052 +               printk(KERN_WARNING
75053 +                      "xenbus_probe: watch_otherend on %s failed.\n",
75054 +                      dev->nodename);
75055 +               return err;
75056 +       }
75057 +
75058 +       return 0;
75059 +fail:
75060 +       xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
75061 +       xenbus_switch_state(dev, XenbusStateClosed);
75062 +       return -ENODEV;
75063 +}
75064 +
75065 +int xenbus_dev_remove(struct device *_dev)
75066 +{
75067 +       struct xenbus_device *dev = to_xenbus_device(_dev);
75068 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
75069 +
75070 +       DPRINTK("%s", dev->nodename);
75071 +
75072 +       free_otherend_watch(dev);
75073 +       free_otherend_details(dev);
75074 +
75075 +       if (drv->remove)
75076 +               drv->remove(dev);
75077 +
75078 +       xenbus_switch_state(dev, XenbusStateClosed);
75079 +       return 0;
75080 +}
75081 +
75082 +static void xenbus_dev_shutdown(struct device *_dev)
75083 +{
75084 +       struct xenbus_device *dev = to_xenbus_device(_dev);
75085 +       unsigned long timeout = 5*HZ;
75086 +
75087 +       DPRINTK("%s", dev->nodename);
75088 +
75089 +       get_device(&dev->dev);
75090 +       if (dev->state != XenbusStateConnected) {
75091 +               printk("%s: %s: %s != Connected, skipping\n", __FUNCTION__,
75092 +                      dev->nodename, xenbus_strstate(dev->state));
75093 +               goto out;
75094 +       }
75095 +       xenbus_switch_state(dev, XenbusStateClosing);
75096 +       timeout = wait_for_completion_timeout(&dev->down, timeout);
75097 +       if (!timeout)
75098 +               printk("%s: %s timeout closing device\n", __FUNCTION__, dev->nodename);
75099 + out:
75100 +       put_device(&dev->dev);
75101 +}
75102 +
75103 +int xenbus_register_driver_common(struct xenbus_driver *drv,
75104 +                                 struct xen_bus_type *bus)
75105 +{
75106 +       int ret;
75107 +
75108 +       drv->driver.name = drv->name;
75109 +       drv->driver.bus = &bus->bus;
75110 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
75111 +       drv->driver.owner = drv->owner;
75112 +#endif
75113 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
75114 +       drv->driver.probe = xenbus_dev_probe;
75115 +       drv->driver.remove = xenbus_dev_remove;
75116 +       drv->driver.shutdown = xenbus_dev_shutdown;
75117 +#endif
75118 +
75119 +       mutex_lock(&xenwatch_mutex);
75120 +       ret = driver_register(&drv->driver);
75121 +       mutex_unlock(&xenwatch_mutex);
75122 +       return ret;
75123 +}
75124 +
75125 +int xenbus_register_frontend(struct xenbus_driver *drv)
75126 +{
75127 +       int ret;
75128 +
75129 +       drv->read_otherend_details = read_backend_details;
75130 +
75131 +       ret = xenbus_register_driver_common(drv, &xenbus_frontend);
75132 +       if (ret)
75133 +               return ret;
75134 +
75135 +       /* If this driver is loaded as a module wait for devices to attach. */
75136 +       wait_for_devices(drv);
75137 +
75138 +       return 0;
75139 +}
75140 +EXPORT_SYMBOL_GPL(xenbus_register_frontend);
75141 +
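
Pulling the pieces above together, a frontend driver supplies an id table, a probe routine and an otherend_changed hook, then calls xenbus_register_frontend(). This is a minimal, hedged sketch; the demofront names and the "demo" device type are invented for illustration and the probe body is deliberately empty apart from the state change.

#include <linux/module.h>
#include <xen/xenbus.h>

static const struct xenbus_device_id demofront_ids[] = {
	{ "demo" },		/* matched against dev->devicetype */
	{ "" }
};

static int demofront_probe(struct xenbus_device *dev,
			   const struct xenbus_device_id *id)
{
	/* Allocate per-device state and set up rings here, then announce
	 * that the frontend is ready for the backend to connect. */
	return xenbus_switch_state(dev, XenbusStateInitialised);
}

static void demofront_otherend_changed(struct xenbus_device *dev,
				       enum xenbus_state backend_state)
{
	switch (backend_state) {
	case XenbusStateConnected:
		xenbus_switch_state(dev, XenbusStateConnected);
		break;
	case XenbusStateClosing:
		xenbus_switch_state(dev, XenbusStateClosing);
		break;
	case XenbusStateClosed:
		xenbus_frontend_closed(dev);
		break;
	default:
		break;
	}
}

static struct xenbus_driver demofront_driver = {
	.name             = "demofront",
	.owner            = THIS_MODULE,
	.ids              = demofront_ids,
	.probe            = demofront_probe,
	.otherend_changed = demofront_otherend_changed,
};

static int __init demofront_init(void)
{
	/* When built as a module this waits for matching devices to attach. */
	return xenbus_register_frontend(&demofront_driver);
}
module_init(demofront_init);

MODULE_LICENSE("GPL");
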
75142 +void xenbus_unregister_driver(struct xenbus_driver *drv)
75143 +{
75144 +       driver_unregister(&drv->driver);
75145 +}
75146 +EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
75147 +
75148 +struct xb_find_info
75149 +{
75150 +       struct xenbus_device *dev;
75151 +       const char *nodename;
75152 +};
75153 +
75154 +static int cmp_dev(struct device *dev, void *data)
75155 +{
75156 +       struct xenbus_device *xendev = to_xenbus_device(dev);
75157 +       struct xb_find_info *info = data;
75158 +
75159 +       if (!strcmp(xendev->nodename, info->nodename)) {
75160 +               info->dev = xendev;
75161 +               get_device(dev);
75162 +               return 1;
75163 +       }
75164 +       return 0;
75165 +}
75166 +
75167 +struct xenbus_device *xenbus_device_find(const char *nodename,
75168 +                                        struct bus_type *bus)
75169 +{
75170 +       struct xb_find_info info = { .dev = NULL, .nodename = nodename };
75171 +
75172 +       bus_for_each_dev(bus, NULL, &info, cmp_dev);
75173 +       return info.dev;
75174 +}
75175 +
75176 +static int cleanup_dev(struct device *dev, void *data)
75177 +{
75178 +       struct xenbus_device *xendev = to_xenbus_device(dev);
75179 +       struct xb_find_info *info = data;
75180 +       int len = strlen(info->nodename);
75181 +
75182 +       DPRINTK("%s", info->nodename);
75183 +
75184 +       /* Match the info->nodename path, or any subdirectory of that path. */
75185 +       if (strncmp(xendev->nodename, info->nodename, len))
75186 +               return 0;
75187 +
75188 +       /* If the node name is longer, ensure it really is a subdirectory. */
75189 +       if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
75190 +               return 0;
75191 +
75192 +       info->dev = xendev;
75193 +       get_device(dev);
75194 +       return 1;
75195 +}
75196 +
75197 +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
75198 +{
75199 +       struct xb_find_info info = { .nodename = path };
75200 +
75201 +       do {
75202 +               info.dev = NULL;
75203 +               bus_for_each_dev(bus, NULL, &info, cleanup_dev);
75204 +               if (info.dev) {
75205 +                       device_unregister(&info.dev->dev);
75206 +                       put_device(&info.dev->dev);
75207 +               }
75208 +       } while (info.dev);
75209 +}
75210 +
75211 +static void xenbus_dev_release(struct device *dev)
75212 +{
75213 +       if (dev)
75214 +               kfree(to_xenbus_device(dev));
75215 +}
75216 +
75217 +static ssize_t xendev_show_nodename(struct device *dev,
75218 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
75219 +                                   struct device_attribute *attr,
75220 +#endif
75221 +                                   char *buf)
75222 +{
75223 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
75224 +}
75225 +DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
75226 +
75227 +static ssize_t xendev_show_devtype(struct device *dev,
75228 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
75229 +                                  struct device_attribute *attr,
75230 +#endif
75231 +                                  char *buf)
75232 +{
75233 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
75234 +}
75235 +DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
75236 +
75237 +
75238 +int xenbus_probe_node(struct xen_bus_type *bus,
75239 +                     const char *type,
75240 +                     const char *nodename)
75241 +{
75242 +       int err;
75243 +       struct xenbus_device *xendev;
75244 +       size_t stringlen;
75245 +       char *tmpstring;
75246 +
75247 +       enum xenbus_state state = xenbus_read_driver_state(nodename);
75248 +
75249 +       if (state != XenbusStateInitialising) {
75250 +               /* Device is not new, so ignore it.  This can happen if a
75251 +                  device is going away after switching to Closed.  */
75252 +               return 0;
75253 +       }
75254 +
75255 +       stringlen = strlen(nodename) + 1 + strlen(type) + 1;
75256 +       xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
75257 +       if (!xendev)
75258 +               return -ENOMEM;
75259 +
75260 +       xendev->state = XenbusStateInitialising;
75261 +
75262 +       /* Copy the strings into the extra space. */
75263 +
75264 +       tmpstring = (char *)(xendev + 1);
75265 +       strcpy(tmpstring, nodename);
75266 +       xendev->nodename = tmpstring;
75267 +
75268 +       tmpstring += strlen(tmpstring) + 1;
75269 +       strcpy(tmpstring, type);
75270 +       xendev->devicetype = tmpstring;
75271 +       init_completion(&xendev->down);
75272 +
75273 +       xendev->dev.parent = &bus->dev;
75274 +       xendev->dev.bus = &bus->bus;
75275 +       xendev->dev.release = xenbus_dev_release;
75276 +
75277 +       err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
75278 +       if (err)
75279 +               goto fail;
75280 +
75281 +       /* Register with generic device framework. */
75282 +       err = device_register(&xendev->dev);
75283 +       if (err)
75284 +               goto fail;
75285 +
75286 +       device_create_file(&xendev->dev, &dev_attr_nodename);
75287 +       device_create_file(&xendev->dev, &dev_attr_devtype);
75288 +
75289 +       return 0;
75290 +fail:
75291 +       kfree(xendev);
75292 +       return err;
75293 +}
75294 +
75295 +/* device/<typename>/<name> */
75296 +static int xenbus_probe_frontend(const char *type, const char *name)
75297 +{
75298 +       char *nodename;
75299 +       int err;
75300 +
75301 +       nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_frontend.root, type, name);
75302 +       if (!nodename)
75303 +               return -ENOMEM;
75304 +
75305 +       DPRINTK("%s", nodename);
75306 +
75307 +       err = xenbus_probe_node(&xenbus_frontend, type, nodename);
75308 +       kfree(nodename);
75309 +       return err;
75310 +}
75311 +
75312 +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
75313 +{
75314 +       int err = 0;
75315 +       char **dir;
75316 +       unsigned int dir_n = 0;
75317 +       int i;
75318 +
75319 +       dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
75320 +       if (IS_ERR(dir))
75321 +               return PTR_ERR(dir);
75322 +
75323 +       for (i = 0; i < dir_n; i++) {
75324 +               err = bus->probe(type, dir[i]);
75325 +               if (err)
75326 +                       break;
75327 +       }
75328 +       kfree(dir);
75329 +       return err;
75330 +}
75331 +
75332 +int xenbus_probe_devices(struct xen_bus_type *bus)
75333 +{
75334 +       int err = 0;
75335 +       char **dir;
75336 +       unsigned int i, dir_n;
75337 +
75338 +       dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
75339 +       if (IS_ERR(dir))
75340 +               return PTR_ERR(dir);
75341 +
75342 +       for (i = 0; i < dir_n; i++) {
75343 +               err = xenbus_probe_device_type(bus, dir[i]);
75344 +               if (err)
75345 +                       break;
75346 +       }
75347 +       kfree(dir);
75348 +       return err;
75349 +}
75350 +
75351 +static unsigned int char_count(const char *str, char c)
75352 +{
75353 +       unsigned int i, ret = 0;
75354 +
75355 +       for (i = 0; str[i]; i++)
75356 +               if (str[i] == c)
75357 +                       ret++;
75358 +       return ret;
75359 +}
75360 +
75361 +static int strsep_len(const char *str, char c, unsigned int len)
75362 +{
75363 +       unsigned int i;
75364 +
75365 +       for (i = 0; str[i]; i++)
75366 +               if (str[i] == c) {
75367 +                       if (len == 0)
75368 +                               return i;
75369 +                       len--;
75370 +               }
75371 +       return (len == 0) ? i : -ERANGE;
75372 +}
75373 +
75374 +void dev_changed(const char *node, struct xen_bus_type *bus)
75375 +{
75376 +       int exists, rootlen;
75377 +       struct xenbus_device *dev;
75378 +       char type[BUS_ID_SIZE];
75379 +       const char *p, *root;
75380 +
75381 +       if (char_count(node, '/') < 2)
75382 +               return;
75383 +
75384 +       exists = xenbus_exists(XBT_NIL, node, "");
75385 +       if (!exists) {
75386 +               xenbus_cleanup_devices(node, &bus->bus);
75387 +               return;
75388 +       }
75389 +
75390 +       /* backend/<type>/... or device/<type>/... */
75391 +       p = strchr(node, '/') + 1;
75392 +       snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
75393 +       type[BUS_ID_SIZE-1] = '\0';
75394 +
75395 +       rootlen = strsep_len(node, '/', bus->levels);
75396 +       if (rootlen < 0)
75397 +               return;
75398 +       root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
75399 +       if (!root)
75400 +               return;
75401 +
75402 +       dev = xenbus_device_find(root, &bus->bus);
75403 +       if (!dev)
75404 +               xenbus_probe_node(bus, type, root);
75405 +       else
75406 +               put_device(&dev->dev);
75407 +
75408 +       kfree(root);
75409 +}
75410 +
75411 +static void frontend_changed(struct xenbus_watch *watch,
75412 +                            const char **vec, unsigned int len)
75413 +{
75414 +       DPRINTK("");
75415 +
75416 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
75417 +}
75418 +
75419 +/* We watch for devices appearing and vanishing. */
75420 +static struct xenbus_watch fe_watch = {
75421 +       .node = "device",
75422 +       .callback = frontend_changed,
75423 +};
75424 +
75425 +static int suspend_dev(struct device *dev, void *data)
75426 +{
75427 +       int err = 0;
75428 +       struct xenbus_driver *drv;
75429 +       struct xenbus_device *xdev;
75430 +
75431 +       DPRINTK("");
75432 +
75433 +       if (dev->driver == NULL)
75434 +               return 0;
75435 +       drv = to_xenbus_driver(dev->driver);
75436 +       xdev = container_of(dev, struct xenbus_device, dev);
75437 +       if (drv->suspend)
75438 +               err = drv->suspend(xdev);
75439 +       if (err)
75440 +               printk(KERN_WARNING
75441 +                      "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
75442 +       return 0;
75443 +}
75444 +
75445 +static int resume_dev(struct device *dev, void *data)
75446 +{
75447 +       int err;
75448 +       struct xenbus_driver *drv;
75449 +       struct xenbus_device *xdev;
75450 +
75451 +       DPRINTK("");
75452 +
75453 +       if (dev->driver == NULL)
75454 +               return 0;
75455 +
75456 +       drv = to_xenbus_driver(dev->driver);
75457 +       xdev = container_of(dev, struct xenbus_device, dev);
75458 +
75459 +       err = talk_to_otherend(xdev);
75460 +       if (err) {
75461 +               printk(KERN_WARNING
75462 +                      "xenbus: resume (talk_to_otherend) %s failed: %i\n",
75463 +                      dev->bus_id, err);
75464 +               return err;
75465 +       }
75466 +
75467 +       xdev->state = XenbusStateInitialising;
75468 +
75469 +       if (drv->resume) {
75470 +               err = drv->resume(xdev);
75471 +               if (err) { 
75472 +                       printk(KERN_WARNING
75473 +                              "xenbus: resume %s failed: %i\n", 
75474 +                              dev->bus_id, err);
75475 +                       return err;
75476 +               }
75477 +       }
75478 +
75479 +       err = watch_otherend(xdev);
75480 +       if (err) {
75481 +               printk(KERN_WARNING
75482 +                      "xenbus_probe: resume (watch_otherend) %s failed: "
75483 +                      "%d.\n", dev->bus_id, err);
75484 +               return err;
75485 +       }
75486 +
75487 +       return 0;
75488 +}
75489 +
75490 +void xenbus_suspend(void)
75491 +{
75492 +       DPRINTK("");
75493 +
75494 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
75495 +       xenbus_backend_suspend(suspend_dev);
75496 +       xs_suspend();
75497 +}
75498 +EXPORT_SYMBOL_GPL(xenbus_suspend);
75499 +
75500 +void xenbus_resume(void)
75501 +{
75502 +       xb_init_comms();
75503 +       xs_resume();
75504 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
75505 +       xenbus_backend_resume(resume_dev);
75506 +}
75507 +EXPORT_SYMBOL_GPL(xenbus_resume);
75508 +
75509 +
75510 +/* A flag to determine if xenstored is 'ready' (i.e. has started) */
75511 +int xenstored_ready = 0;
75512 +
75513 +
75514 +int register_xenstore_notifier(struct notifier_block *nb)
75515 +{
75516 +       int ret = 0;
75517 +
75518 +       if (xenstored_ready > 0)
75519 +               ret = nb->notifier_call(nb, 0, NULL);
75520 +       else
75521 +               blocking_notifier_chain_register(&xenstore_notifier_list, nb);
75522 +
75523 +       return ret;
75524 +}
75525 +EXPORT_SYMBOL_GPL(register_xenstore_notifier);
75526 +
75527 +void unregister_xenstore_notifier(struct notifier_block *nb)
75528 +{
75529 +       blocking_notifier_chain_unregister(&xenstore_notifier_list, nb);
75530 +}
75531 +EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
75532 +
75533 +
75534 +void xenbus_probe(void *unused)
75535 +{
75536 +       BUG_ON(xenstored_ready <= 0);
75537 +
75538 +       /* Enumerate devices in xenstore and watch for changes. */
75539 +       xenbus_probe_devices(&xenbus_frontend);
75540 +       register_xenbus_watch(&fe_watch);
75541 +       xenbus_backend_probe_and_watch();
75542 +
75543 +       /* Notify others that xenstore is up */
75544 +       blocking_notifier_call_chain(&xenstore_notifier_list, 0, NULL);
75545 +}
75546 +
75547 +
75548 +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
75549 +static struct file_operations xsd_kva_fops;
75550 +static struct proc_dir_entry *xsd_kva_intf;
75551 +static struct proc_dir_entry *xsd_port_intf;
75552 +
75553 +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
75554 +{
75555 +       size_t size = vma->vm_end - vma->vm_start;
75556 +
75557 +       if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
75558 +               return -EINVAL;
75559 +
75560 +       if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn),
75561 +                           size, vma->vm_page_prot))
75562 +               return -EAGAIN;
75563 +
75564 +       return 0;
75565 +}
75566 +
75567 +static int xsd_kva_read(char *page, char **start, off_t off,
75568 +                       int count, int *eof, void *data)
75569 +{
75570 +       int len;
75571 +
75572 +       len  = sprintf(page, "0x%p", xen_store_interface);
75573 +       *eof = 1;
75574 +       return len;
75575 +}
75576 +
75577 +static int xsd_port_read(char *page, char **start, off_t off,
75578 +                        int count, int *eof, void *data)
75579 +{
75580 +       int len;
75581 +
75582 +       len  = sprintf(page, "%d", xen_store_evtchn);
75583 +       *eof = 1;
75584 +       return len;
75585 +}
75586 +#endif
75587 +
75588 +static int __init xenbus_probe_init(void)
75589 +{
75590 +       int err = 0;
75591 +       unsigned long page = 0;
75592 +
75593 +       DPRINTK("");
75594 +
75595 +       if (!is_running_on_xen())
75596 +               return -ENODEV;
75597 +
75598 +       /* Register ourselves with the kernel bus subsystem */
75599 +       bus_register(&xenbus_frontend.bus);
75600 +       xenbus_backend_bus_register();
75601 +
75602 +       /*
75603 +        * Domain0 doesn't have a store_evtchn or store_mfn yet.
75604 +        */
75605 +       if (is_initial_xendomain()) {
75606 +               struct evtchn_alloc_unbound alloc_unbound;
75607 +
75608 +               /* Allocate page. */
75609 +               page = get_zeroed_page(GFP_KERNEL);
75610 +               if (!page)
75611 +                       return -ENOMEM;
75612 +
75613 +               xen_store_mfn = xen_start_info->store_mfn =
75614 +                       pfn_to_mfn(virt_to_phys((void *)page) >>
75615 +                                  PAGE_SHIFT);
75616 +
75617 +               /* Next allocate a local port which xenstored can bind to */
75618 +               alloc_unbound.dom        = DOMID_SELF;
75619 +               alloc_unbound.remote_dom = 0;
75620 +
75621 +               err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
75622 +                                                 &alloc_unbound);
75623 +               if (err == -ENOSYS)
75624 +                       goto err;
75625 +               BUG_ON(err);
75626 +               xen_store_evtchn = xen_start_info->store_evtchn =
75627 +                       alloc_unbound.port;
75628 +
75629 +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
75630 +               /* And finally publish the above info in /proc/xen */
75631 +               xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
75632 +               if (xsd_kva_intf) {
75633 +                       memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
75634 +                              sizeof(xsd_kva_fops));
75635 +                       xsd_kva_fops.mmap = xsd_kva_mmap;
75636 +                       xsd_kva_intf->proc_fops = &xsd_kva_fops;
75637 +                       xsd_kva_intf->read_proc = xsd_kva_read;
75638 +               }
75639 +               xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
75640 +               if (xsd_port_intf)
75641 +                       xsd_port_intf->read_proc = xsd_port_read;
75642 +#endif
75643 +               xen_store_interface = mfn_to_virt(xen_store_mfn);
75644 +       } else {
75645 +               xenstored_ready = 1;
75646 +#ifdef CONFIG_XEN
75647 +               xen_store_evtchn = xen_start_info->store_evtchn;
75648 +               xen_store_mfn = xen_start_info->store_mfn;
75649 +               xen_store_interface = mfn_to_virt(xen_store_mfn);
75650 +#else
75651 +               xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
75652 +               xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
75653 +               xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT,
75654 +                                             PAGE_SIZE);
75655 +#endif
75656 +       }
75657 +
75658 +
75659 +       xenbus_dev_init();
75660 +
75661 +       /* Initialize the interface to xenstore. */
75662 +       err = xs_init();
75663 +       if (err) {
75664 +               printk(KERN_WARNING
75665 +                      "XENBUS: Error initializing xenstore comms: %i\n", err);
75666 +               goto err;
75667 +       }
75668 +
75669 +       /* Register ourselves with the kernel device subsystem */
75670 +       device_register(&xenbus_frontend.dev);
75671 +       xenbus_backend_device_register();
75672 +
75673 +       if (!is_initial_xendomain())
75674 +               xenbus_probe(NULL);
75675 +
75676 +       return 0;
75677 +
75678 + err:
75679 +       if (page)
75680 +               free_page(page);
75681 +
75682 +       /*
75683 +        * Do not unregister the xenbus front/backend buses here. The buses
75684 +        * must exist because front/backend drivers will use them when they are
75685 +        * registered.
75686 +        */
75687 +
75688 +       return err;
75689 +}
75690 +
75691 +postcore_initcall(xenbus_probe_init);
75692 +
75693 +MODULE_LICENSE("Dual BSD/GPL");
75694 +
75695 +
75696 +static int is_disconnected_device(struct device *dev, void *data)
75697 +{
75698 +       struct xenbus_device *xendev = to_xenbus_device(dev);
75699 +       struct device_driver *drv = data;
75700 +
75701 +       /*
75702 +        * A device with no driver will never connect. We care only about
75703 +        * devices which should currently be in the process of connecting.
75704 +        */
75705 +       if (!dev->driver)
75706 +               return 0;
75707 +
75708 +       /* Is this search limited to a particular driver? */
75709 +       if (drv && (dev->driver != drv))
75710 +               return 0;
75711 +
75712 +       return (xendev->state != XenbusStateConnected);
75713 +}
75714 +
75715 +static int exists_disconnected_device(struct device_driver *drv)
75716 +{
75717 +       return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
75718 +                               is_disconnected_device);
75719 +}
75720 +
75721 +static int print_device_status(struct device *dev, void *data)
75722 +{
75723 +       struct xenbus_device *xendev = to_xenbus_device(dev);
75724 +       struct device_driver *drv = data;
75725 +
75726 +       /* Is this operation limited to a particular driver? */
75727 +       if (drv && (dev->driver != drv))
75728 +               return 0;
75729 +
75730 +       if (!dev->driver) {
75731 +               /* Information only: is this too noisy? */
75732 +               printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
75733 +                      xendev->nodename);
75734 +       } else if (xendev->state != XenbusStateConnected) {
75735 +               printk(KERN_WARNING "XENBUS: Timeout connecting "
75736 +                      "to device: %s (state %d)\n",
75737 +                      xendev->nodename, xendev->state);
75738 +       }
75739 +
75740 +       return 0;
75741 +}
75742 +
75743 +/* We only wait for device setup after most initcalls have run. */
75744 +static int ready_to_wait_for_devices;
75745 +
75746 +/*
75747 + * On a 10 second timeout, wait for all devices currently configured.  We need
75748 + * to do this to guarantee that the filesystems and/or network devices
75749 + * needed for boot are available, before we can allow the boot to proceed.
75750 + *
75751 + * This needs to be on a late_initcall, to happen after the frontend device
75752 + * drivers have been initialised, but before the root fs is mounted.
75753 + *
75754 + * A possible improvement here would be to have the tools add a per-device
75755 + * flag to the store entry, indicating whether it is needed at boot time.
75756 + * This would allow people who knew what they were doing to accelerate their
75757 + * boot slightly, but of course needs tools or manual intervention to set up
75758 + * those flags correctly.
75759 + */
75760 +static void wait_for_devices(struct xenbus_driver *xendrv)
75761 +{
75762 +       unsigned long timeout = jiffies + 10*HZ;
75763 +       struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
75764 +
75765 +       if (!ready_to_wait_for_devices || !is_running_on_xen())
75766 +               return;
75767 +
75768 +       while (exists_disconnected_device(drv)) {
75769 +               if (time_after(jiffies, timeout))
75770 +                       break;
75771 +               schedule_timeout_interruptible(HZ/10);
75772 +       }
75773 +
75774 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
75775 +                        print_device_status);
75776 +}
75777 +
75778 +#ifndef MODULE
75779 +static int __init boot_wait_for_devices(void)
75780 +{
75781 +       ready_to_wait_for_devices = 1;
75782 +       wait_for_devices(NULL);
75783 +       return 0;
75784 +}
75785 +
75786 +late_initcall(boot_wait_for_devices);
75787 +#endif
75788 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_probe.h linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_probe.h
75789 --- linux-2.6.19/drivers/xen/xenbus/xenbus_probe.h      1970-01-01 00:00:00.000000000 +0000
75790 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_probe.h    2007-02-02 19:10:45.000000000 +0000
75791 @@ -0,0 +1,73 @@
75792 +/******************************************************************************
75793 + * xenbus_probe.h
75794 + *
75795 + * Talks to Xen Store to figure out what devices we have.
75796 + *
75797 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
75798 + * Copyright (C) 2005 XenSource Ltd.
75799 + * 
75800 + * This program is free software; you can redistribute it and/or
75801 + * modify it under the terms of the GNU General Public License version 2
75802 + * as published by the Free Software Foundation; or, when distributed
75803 + * separately from the Linux kernel or incorporated into other
75804 + * software packages, subject to the following license:
75805 + * 
75806 + * Permission is hereby granted, free of charge, to any person obtaining a copy
75807 + * of this source file (the "Software"), to deal in the Software without
75808 + * restriction, including without limitation the rights to use, copy, modify,
75809 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
75810 + * and to permit persons to whom the Software is furnished to do so, subject to
75811 + * the following conditions:
75812 + * 
75813 + * The above copyright notice and this permission notice shall be included in
75814 + * all copies or substantial portions of the Software.
75815 + * 
75816 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
75817 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
75818 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
75819 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
75820 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
75821 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
75822 + * IN THE SOFTWARE.
75823 + */
75824 +
75825 +#ifndef _XENBUS_PROBE_H
75826 +#define _XENBUS_PROBE_H
75827 +
75828 +#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
75829 +extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
75830 +extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
75831 +extern void xenbus_backend_probe_and_watch(void);
75832 +extern void xenbus_backend_bus_register(void);
75833 +extern void xenbus_backend_device_register(void);
75834 +#else
75835 +static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
75836 +static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
75837 +static inline void xenbus_backend_probe_and_watch(void) {}
75838 +static inline void xenbus_backend_bus_register(void) {}
75839 +static inline void xenbus_backend_device_register(void) {}
75840 +#endif
75841 +
75842 +struct xen_bus_type
75843 +{
75844 +       char *root;
75845 +       unsigned int levels;
75846 +       int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
75847 +       int (*probe)(const char *type, const char *dir);
75848 +       struct bus_type bus;
75849 +       struct device dev;
75850 +};
75851 +
75852 +extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
75853 +extern int xenbus_dev_probe(struct device *_dev);
75854 +extern int xenbus_dev_remove(struct device *_dev);
75855 +extern int xenbus_register_driver_common(struct xenbus_driver *drv,
75856 +                                        struct xen_bus_type *bus);
75857 +extern int xenbus_probe_node(struct xen_bus_type *bus,
75858 +                            const char *type,
75859 +                            const char *nodename);
75860 +extern int xenbus_probe_devices(struct xen_bus_type *bus);
75861 +
75862 +extern void dev_changed(const char *node, struct xen_bus_type *bus);
75863 +#endif
75864 +
75865 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_probe_backend.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_probe_backend.c
75866 --- linux-2.6.19/drivers/xen/xenbus/xenbus_probe_backend.c      1970-01-01 00:00:00.000000000 +0000
75867 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_probe_backend.c    2007-02-02 19:10:45.000000000 +0000
75868 @@ -0,0 +1,271 @@
75869 +/******************************************************************************
75870 + * Talks to Xen Store to figure out what devices we have (backend half).
75871 + *
75872 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
75873 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
75874 + * Copyright (C) 2005, 2006 XenSource Ltd
75875 + * 
75876 + * This program is free software; you can redistribute it and/or
75877 + * modify it under the terms of the GNU General Public License version 2
75878 + * as published by the Free Software Foundation; or, when distributed
75879 + * separately from the Linux kernel or incorporated into other
75880 + * software packages, subject to the following license:
75881 + * 
75882 + * Permission is hereby granted, free of charge, to any person obtaining a copy
75883 + * of this source file (the "Software"), to deal in the Software without
75884 + * restriction, including without limitation the rights to use, copy, modify,
75885 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
75886 + * and to permit persons to whom the Software is furnished to do so, subject to
75887 + * the following conditions:
75888 + * 
75889 + * The above copyright notice and this permission notice shall be included in
75890 + * all copies or substantial portions of the Software.
75891 + * 
75892 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
75893 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
75894 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
75895 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
75896 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
75897 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
75898 + * IN THE SOFTWARE.
75899 + */
75900 +
75901 +#define DPRINTK(fmt, args...)                          \
75902 +       pr_debug("xenbus_probe (%s:%d) " fmt ".\n",     \
75903 +                __FUNCTION__, __LINE__, ##args)
75904 +
75905 +#include <linux/kernel.h>
75906 +#include <linux/err.h>
75907 +#include <linux/string.h>
75908 +#include <linux/ctype.h>
75909 +#include <linux/fcntl.h>
75910 +#include <linux/mm.h>
75911 +#include <linux/notifier.h>
75912 +#include <linux/kthread.h>
75913 +
75914 +#include <asm/io.h>
75915 +#include <asm/page.h>
75916 +#include <asm/maddr.h>
75917 +#include <asm/pgtable.h>
75918 +#include <asm/hypervisor.h>
75919 +#include <xen/xenbus.h>
75920 +#include <xen/xen_proc.h>
75921 +#include <xen/evtchn.h>
75922 +#include <xen/features.h>
75923 +#include <xen/hvm.h>
75924 +
75925 +#include "xenbus_comms.h"
75926 +#include "xenbus_probe.h"
75927 +
75928 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
75929 +#include <xen/platform-compat.h>
75930 +#endif
75931 +
75932 +static int xenbus_uevent_backend(struct device *dev, char **envp,
75933 +                                int num_envp, char *buffer, int buffer_size);
75934 +static int xenbus_probe_backend(const char *type, const char *domid);
75935 +
75936 +extern int read_otherend_details(struct xenbus_device *xendev,
75937 +                                char *id_node, char *path_node);
75938 +
75939 +static int read_frontend_details(struct xenbus_device *xendev)
75940 +{
75941 +       return read_otherend_details(xendev, "frontend-id", "frontend");
75942 +}
75943 +
75944 +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
75945 +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
75946 +{
75947 +       int domid, err;
75948 +       const char *devid, *type, *frontend;
75949 +       unsigned int typelen;
75950 +
75951 +       type = strchr(nodename, '/');
75952 +       if (!type)
75953 +               return -EINVAL;
75954 +       type++;
75955 +       typelen = strcspn(type, "/");
75956 +       if (!typelen || type[typelen] != '/')
75957 +               return -EINVAL;
75958 +
75959 +       devid = strrchr(nodename, '/') + 1;
75960 +
75961 +       err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
75962 +                           "frontend", NULL, &frontend,
75963 +                           NULL);
75964 +       if (err)
75965 +               return err;
75966 +       if (strlen(frontend) == 0)
75967 +               err = -ERANGE;
75968 +       if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
75969 +               err = -ENOENT;
75970 +       kfree(frontend);
75971 +
75972 +       if (err)
75973 +               return err;
75974 +
75975 +       if (snprintf(bus_id, BUS_ID_SIZE,
75976 +                    "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
75977 +               return -ENOSPC;
75978 +       return 0;
75979 +}
75980 +
75981 +static struct xen_bus_type xenbus_backend = {
75982 +       .root = "backend",
75983 +       .levels = 3,            /* backend/type/<frontend>/<id> */
75984 +       .get_bus_id = backend_bus_id,
75985 +       .probe = xenbus_probe_backend,
75986 +       .bus = {
75987 +               .name     = "xen-backend",
75988 +               .match    = xenbus_match,
75989 +               .probe    = xenbus_dev_probe,
75990 +               .remove   = xenbus_dev_remove,
75991 +//             .shutdown = xenbus_dev_shutdown,
75992 +               .uevent   = xenbus_uevent_backend,
75993 +       },
75994 +       .dev = {
75995 +               .bus_id = "xen-backend",
75996 +       },
75997 +};
75998 +
75999 +static int xenbus_uevent_backend(struct device *dev, char **envp,
76000 +                                int num_envp, char *buffer, int buffer_size)
76001 +{
76002 +       struct xenbus_device *xdev;
76003 +       struct xenbus_driver *drv;
76004 +       int i = 0;
76005 +       int length = 0;
76006 +
76007 +       DPRINTK("");
76008 +
76009 +       if (dev == NULL)
76010 +               return -ENODEV;
76011 +
76012 +       xdev = to_xenbus_device(dev);
76013 +       if (xdev == NULL)
76014 +               return -ENODEV;
76015 +
76016 +       /* stuff we want to pass to /sbin/hotplug */
76017 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
76018 +                      "XENBUS_TYPE=%s", xdev->devicetype);
76019 +
76020 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
76021 +                      "XENBUS_PATH=%s", xdev->nodename);
76022 +
76023 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
76024 +                      "XENBUS_BASE_PATH=%s", xenbus_backend.root);
76025 +
76026 +       /* terminate, set to next free slot, shrink available space */
76027 +       envp[i] = NULL;
76028 +       envp = &envp[i];
76029 +       num_envp -= i;
76030 +       buffer = &buffer[length];
76031 +       buffer_size -= length;
76032 +
76033 +       if (dev->driver) {
76034 +               drv = to_xenbus_driver(dev->driver);
76035 +               if (drv && drv->uevent)
76036 +                       return drv->uevent(xdev, envp, num_envp, buffer,
76037 +                                          buffer_size);
76038 +       }
76039 +
76040 +       return 0;
76041 +}
76042 +
76043 +int xenbus_register_backend(struct xenbus_driver *drv)
76044 +{
76045 +       drv->read_otherend_details = read_frontend_details;
76046 +
76047 +       return xenbus_register_driver_common(drv, &xenbus_backend);
76048 +}
76049 +EXPORT_SYMBOL_GPL(xenbus_register_backend);
76050 +
76051 +/* backend/<typename>/<frontend-uuid>/<name> */
76052 +static int xenbus_probe_backend_unit(const char *dir,
76053 +                                    const char *type,
76054 +                                    const char *name)
76055 +{
76056 +       char *nodename;
76057 +       int err;
76058 +
76059 +       nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
76060 +       if (!nodename)
76061 +               return -ENOMEM;
76062 +
76063 +       DPRINTK("%s\n", nodename);
76064 +
76065 +       err = xenbus_probe_node(&xenbus_backend, type, nodename);
76066 +       kfree(nodename);
76067 +       return err;
76068 +}
76069 +
76070 +/* backend/<typename>/<frontend-domid> */
76071 +static int xenbus_probe_backend(const char *type, const char *domid)
76072 +{
76073 +       char *nodename;
76074 +       int err = 0;
76075 +       char **dir;
76076 +       unsigned int i, dir_n = 0;
76077 +
76078 +       DPRINTK("");
76079 +
76080 +       nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid);
76081 +       if (!nodename)
76082 +               return -ENOMEM;
76083 +
76084 +       dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
76085 +       if (IS_ERR(dir)) {
76086 +               kfree(nodename);
76087 +               return PTR_ERR(dir);
76088 +       }
76089 +
76090 +       for (i = 0; i < dir_n; i++) {
76091 +               err = xenbus_probe_backend_unit(nodename, type, dir[i]);
76092 +               if (err)
76093 +                       break;
76094 +       }
76095 +       kfree(dir);
76096 +       kfree(nodename);
76097 +       return err;
76098 +}
76099 +
76100 +static void backend_changed(struct xenbus_watch *watch,
76101 +                           const char **vec, unsigned int len)
76102 +{
76103 +       DPRINTK("");
76104 +
76105 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
76106 +}
76107 +
76108 +static struct xenbus_watch be_watch = {
76109 +       .node = "backend",
76110 +       .callback = backend_changed,
76111 +};
76112 +
76113 +void xenbus_backend_suspend(int (*fn)(struct device *, void *))
76114 +{
76115 +       DPRINTK("");
76116 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
76117 +}
76118 +
76119 +void xenbus_backend_resume(int (*fn)(struct device *, void *))
76120 +{
76121 +       DPRINTK("");
76122 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
76123 +}
76124 +
76125 +void xenbus_backend_probe_and_watch(void)
76126 +{
76127 +       xenbus_probe_devices(&xenbus_backend);
76128 +       register_xenbus_watch(&be_watch);
76129 +}
76130 +
76131 +void xenbus_backend_bus_register(void)
76132 +{
76133 +       bus_register(&xenbus_backend.bus);
76134 +}
76135 +
76136 +void xenbus_backend_device_register(void)
76137 +{
76138 +       device_register(&xenbus_backend.dev);
76139 +}
76140 diff -ruNp linux-2.6.19/drivers/xen/xenbus/xenbus_xs.c linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_xs.c
76141 --- linux-2.6.19/drivers/xen/xenbus/xenbus_xs.c 1970-01-01 00:00:00.000000000 +0000
76142 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenbus/xenbus_xs.c       2007-02-02 19:10:46.000000000 +0000
76143 @@ -0,0 +1,859 @@
76144 +/******************************************************************************
76145 + * xenbus_xs.c
76146 + *
76147 + * This is the kernel equivalent of the "xs" library.  We don't need everything
76148 + * and we use xenbus_comms for communication.
76149 + *
76150 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
76151 + * 
76152 + * This program is free software; you can redistribute it and/or
76153 + * modify it under the terms of the GNU General Public License version 2
76154 + * as published by the Free Software Foundation; or, when distributed
76155 + * separately from the Linux kernel or incorporated into other
76156 + * software packages, subject to the following license:
76157 + * 
76158 + * Permission is hereby granted, free of charge, to any person obtaining a copy
76159 + * of this source file (the "Software"), to deal in the Software without
76160 + * restriction, including without limitation the rights to use, copy, modify,
76161 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
76162 + * and to permit persons to whom the Software is furnished to do so, subject to
76163 + * the following conditions:
76164 + * 
76165 + * The above copyright notice and this permission notice shall be included in
76166 + * all copies or substantial portions of the Software.
76167 + * 
76168 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
76169 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
76170 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
76171 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
76172 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
76173 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
76174 + * IN THE SOFTWARE.
76175 + */
76176 +
76177 +#include <linux/unistd.h>
76178 +#include <linux/errno.h>
76179 +#include <linux/types.h>
76180 +#include <linux/uio.h>
76181 +#include <linux/kernel.h>
76182 +#include <linux/string.h>
76183 +#include <linux/err.h>
76184 +#include <linux/slab.h>
76185 +#include <linux/fcntl.h>
76186 +#include <linux/kthread.h>
76187 +#include <linux/rwsem.h>
76188 +#include <linux/module.h>
76189 +#include <linux/mutex.h>
76190 +#include <xen/xenbus.h>
76191 +#include "xenbus_comms.h"
76192 +
76193 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
76194 +#include <xen/platform-compat.h>
76195 +#endif
76196 +
76197 +struct xs_stored_msg {
76198 +       struct list_head list;
76199 +
76200 +       struct xsd_sockmsg hdr;
76201 +
76202 +       union {
76203 +               /* Queued replies. */
76204 +               struct {
76205 +                       char *body;
76206 +               } reply;
76207 +
76208 +               /* Queued watch events. */
76209 +               struct {
76210 +                       struct xenbus_watch *handle;
76211 +                       char **vec;
76212 +                       unsigned int vec_size;
76213 +               } watch;
76214 +       } u;
76215 +};
76216 +
76217 +struct xs_handle {
76218 +       /* A list of replies. Currently only one will ever be outstanding. */
76219 +       struct list_head reply_list;
76220 +       spinlock_t reply_lock;
76221 +       wait_queue_head_t reply_waitq;
76222 +
76223 +       /* One request at a time. */
76224 +       struct mutex request_mutex;
76225 +
76226 +       /* Protect transactions against save/restore. */
76227 +       struct rw_semaphore suspend_mutex;
76228 +};
76229 +
76230 +static struct xs_handle xs_state;
76231 +
76232 +/* List of registered watches, and a lock to protect it. */
76233 +static LIST_HEAD(watches);
76234 +static DEFINE_SPINLOCK(watches_lock);
76235 +
76236 +/* List of pending watch callback events, and a lock to protect it. */
76237 +static LIST_HEAD(watch_events);
76238 +static DEFINE_SPINLOCK(watch_events_lock);
76239 +
76240 +/*
76241 + * Details of the xenwatch callback kernel thread. The thread waits on the
76242 + * watch_events_waitq for work to do (queued on watch_events list). When it
76243 + * wakes up it acquires the xenwatch_mutex before reading the list and
76244 + * carrying out work.
76245 + */
76246 +static pid_t xenwatch_pid;
76247 +/* static */ DEFINE_MUTEX(xenwatch_mutex);
76248 +static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
76249 +
76250 +static int get_error(const char *errorstring)
76251 +{
76252 +       unsigned int i;
76253 +
76254 +       for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
76255 +               if (i == ARRAY_SIZE(xsd_errors) - 1) {
76256 +                       printk(KERN_WARNING
76257 +                              "XENBUS xen store gave: unknown error %s",
76258 +                              errorstring);
76259 +                       return EINVAL;
76260 +               }
76261 +       }
76262 +       return xsd_errors[i].errnum;
76263 +}
76264 +
76265 +static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
76266 +{
76267 +       struct xs_stored_msg *msg;
76268 +       char *body;
76269 +
76270 +       spin_lock(&xs_state.reply_lock);
76271 +
76272 +       while (list_empty(&xs_state.reply_list)) {
76273 +               spin_unlock(&xs_state.reply_lock);
76274 +               /* XXX FIXME: Avoid synchronous wait for response here. */
76275 +               wait_event(xs_state.reply_waitq,
76276 +                          !list_empty(&xs_state.reply_list));
76277 +               spin_lock(&xs_state.reply_lock);
76278 +       }
76279 +
76280 +       msg = list_entry(xs_state.reply_list.next,
76281 +                        struct xs_stored_msg, list);
76282 +       list_del(&msg->list);
76283 +
76284 +       spin_unlock(&xs_state.reply_lock);
76285 +
76286 +       *type = msg->hdr.type;
76287 +       if (len)
76288 +               *len = msg->hdr.len;
76289 +       body = msg->u.reply.body;
76290 +
76291 +       kfree(msg);
76292 +
76293 +       return body;
76294 +}
76295 +
76296 +/* Emergency write. */
76297 +void xenbus_debug_write(const char *str, unsigned int count)
76298 +{
76299 +       struct xsd_sockmsg msg = { 0 };
76300 +
76301 +       msg.type = XS_DEBUG;
76302 +       msg.len = sizeof("print") + count + 1;
76303 +
76304 +       mutex_lock(&xs_state.request_mutex);
76305 +       xb_write(&msg, sizeof(msg));
76306 +       xb_write("print", sizeof("print"));
76307 +       xb_write(str, count);
76308 +       xb_write("", 1);
76309 +       mutex_unlock(&xs_state.request_mutex);
76310 +}
76311 +
76312 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
76313 +{
76314 +       void *ret;
76315 +       struct xsd_sockmsg req_msg = *msg;
76316 +       int err;
76317 +
76318 +       if (req_msg.type == XS_TRANSACTION_START)
76319 +               down_read(&xs_state.suspend_mutex);
76320 +
76321 +       mutex_lock(&xs_state.request_mutex);
76322 +
76323 +       err = xb_write(msg, sizeof(*msg) + msg->len);
76324 +       if (err) {
76325 +               msg->type = XS_ERROR;
76326 +               ret = ERR_PTR(err);
76327 +       } else
76328 +               ret = read_reply(&msg->type, &msg->len);
76329 +
76330 +       mutex_unlock(&xs_state.request_mutex);
76331 +
76332 +       if ((req_msg.type == XS_TRANSACTION_END) ||
76333 +           ((req_msg.type == XS_TRANSACTION_START) &&
76334 +            (msg->type == XS_ERROR)))
76335 +               up_read(&xs_state.suspend_mutex);
76336 +
76337 +       return ret;
76338 +}
76339 +
76340 +/* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
76341 +static void *xs_talkv(struct xenbus_transaction t,
76342 +                     enum xsd_sockmsg_type type,
76343 +                     const struct kvec *iovec,
76344 +                     unsigned int num_vecs,
76345 +                     unsigned int *len)
76346 +{
76347 +       struct xsd_sockmsg msg;
76348 +       void *ret = NULL;
76349 +       unsigned int i;
76350 +       int err;
76351 +
76352 +       msg.tx_id = t.id;
76353 +       msg.req_id = 0;
76354 +       msg.type = type;
76355 +       msg.len = 0;
76356 +       for (i = 0; i < num_vecs; i++)
76357 +               msg.len += iovec[i].iov_len;
76358 +
76359 +       mutex_lock(&xs_state.request_mutex);
76360 +
76361 +       err = xb_write(&msg, sizeof(msg));
76362 +       if (err) {
76363 +               mutex_unlock(&xs_state.request_mutex);
76364 +               return ERR_PTR(err);
76365 +       }
76366 +
76367 +       for (i = 0; i < num_vecs; i++) {
76368 +               err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
76369 +               if (err) {
76370 +                       mutex_unlock(&xs_state.request_mutex);
76371 +                       return ERR_PTR(err);
76372 +               }
76373 +       }
76374 +
76375 +       ret = read_reply(&msg.type, len);
76376 +
76377 +       mutex_unlock(&xs_state.request_mutex);
76378 +
76379 +       if (IS_ERR(ret))
76380 +               return ret;
76381 +
76382 +       if (msg.type == XS_ERROR) {
76383 +               err = get_error(ret);
76384 +               kfree(ret);
76385 +               return ERR_PTR(-err);
76386 +       }
76387 +
76388 +       if (msg.type != type) {
76389 +               if (printk_ratelimit())
76390 +                       printk(KERN_WARNING
76391 +                              "XENBUS unexpected type [%d], expected [%d]\n",
76392 +                              msg.type, type);
76393 +               kfree(ret);
76394 +               return ERR_PTR(-EINVAL);
76395 +       }
76396 +       return ret;
76397 +}
76398 +
76399 +/* Simplified version of xs_talkv: single message. */
76400 +static void *xs_single(struct xenbus_transaction t,
76401 +                      enum xsd_sockmsg_type type,
76402 +                      const char *string,
76403 +                      unsigned int *len)
76404 +{
76405 +       struct kvec iovec;
76406 +
76407 +       iovec.iov_base = (void *)string;
76408 +       iovec.iov_len = strlen(string) + 1;
76409 +       return xs_talkv(t, type, &iovec, 1, len);
76410 +}
76411 +
76412 +/* Many commands only need an ack, don't care what it says. */
76413 +static int xs_error(char *reply)
76414 +{
76415 +       if (IS_ERR(reply))
76416 +               return PTR_ERR(reply);
76417 +       kfree(reply);
76418 +       return 0;
76419 +}
76420 +
76421 +static unsigned int count_strings(const char *strings, unsigned int len)
76422 +{
76423 +       unsigned int num;
76424 +       const char *p;
76425 +
76426 +       for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
76427 +               num++;
76428 +
76429 +       return num;
76430 +}
76431 +
76432 +/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
76433 +static char *join(const char *dir, const char *name)
76434 +{
76435 +       char *buffer;
76436 +
76437 +       if (strlen(name) == 0)
76438 +               buffer = kasprintf(GFP_KERNEL, "%s", dir);
76439 +       else
76440 +               buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
76441 +       return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
76442 +}
76443 +
76444 +static char **split(char *strings, unsigned int len, unsigned int *num)
76445 +{
76446 +       char *p, **ret;
76447 +
76448 +       /* Count the strings. */
76449 +       *num = count_strings(strings, len);
76450 +
76451 +       /* Transfer to one big alloc for easy freeing. */
76452 +       ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
76453 +       if (!ret) {
76454 +               kfree(strings);
76455 +               return ERR_PTR(-ENOMEM);
76456 +       }
76457 +       memcpy(&ret[*num], strings, len);
76458 +       kfree(strings);
76459 +
76460 +       strings = (char *)&ret[*num];
76461 +       for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
76462 +               ret[(*num)++] = p;
76463 +
76464 +       return ret;
76465 +}
76466 +
76467 +char **xenbus_directory(struct xenbus_transaction t,
76468 +                       const char *dir, const char *node, unsigned int *num)
76469 +{
76470 +       char *strings, *path;
76471 +       unsigned int len;
76472 +
76473 +       path = join(dir, node);
76474 +       if (IS_ERR(path))
76475 +               return (char **)path;
76476 +
76477 +       strings = xs_single(t, XS_DIRECTORY, path, &len);
76478 +       kfree(path);
76479 +       if (IS_ERR(strings))
76480 +               return (char **)strings;
76481 +
76482 +       return split(strings, len, num);
76483 +}
76484 +EXPORT_SYMBOL_GPL(xenbus_directory);
76485 +
76486 +/* Check if a path exists. Return 1 if it does. */
76487 +int xenbus_exists(struct xenbus_transaction t,
76488 +                 const char *dir, const char *node)
76489 +{
76490 +       char **d;
76491 +       unsigned int dir_n;
76492 +
76493 +       d = xenbus_directory(t, dir, node, &dir_n);
76494 +       if (IS_ERR(d))
76495 +               return 0;
76496 +       kfree(d);
76497 +       return 1;
76498 +}
76499 +EXPORT_SYMBOL_GPL(xenbus_exists);
76500 +
76501 +/* Get the value of a single file.
76502 + * Returns a kmalloced value: call kfree() on it after use.
76503 + * len indicates length in bytes.
76504 + */
76505 +void *xenbus_read(struct xenbus_transaction t,
76506 +                 const char *dir, const char *node, unsigned int *len)
76507 +{
76508 +       char *path;
76509 +       void *ret;
76510 +
76511 +       path = join(dir, node);
76512 +       if (IS_ERR(path))
76513 +               return (void *)path;
76514 +
76515 +       ret = xs_single(t, XS_READ, path, len);
76516 +       kfree(path);
76517 +       return ret;
76518 +}
76519 +EXPORT_SYMBOL_GPL(xenbus_read);
76520 +
76521 +/* Write the value of a single file.
76522 + * Returns -err on failure.
76523 + */
76524 +int xenbus_write(struct xenbus_transaction t,
76525 +                const char *dir, const char *node, const char *string)
76526 +{
76527 +       const char *path;
76528 +       struct kvec iovec[2];
76529 +       int ret;
76530 +
76531 +       path = join(dir, node);
76532 +       if (IS_ERR(path))
76533 +               return PTR_ERR(path);
76534 +
76535 +       iovec[0].iov_base = (void *)path;
76536 +       iovec[0].iov_len = strlen(path) + 1;
76537 +       iovec[1].iov_base = (void *)string;
76538 +       iovec[1].iov_len = strlen(string);
76539 +
76540 +       ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
76541 +       kfree(path);
76542 +       return ret;
76543 +}
76544 +EXPORT_SYMBOL_GPL(xenbus_write);
76545 +
76546 +/* Create a new directory. */
76547 +int xenbus_mkdir(struct xenbus_transaction t,
76548 +                const char *dir, const char *node)
76549 +{
76550 +       char *path;
76551 +       int ret;
76552 +
76553 +       path = join(dir, node);
76554 +       if (IS_ERR(path))
76555 +               return PTR_ERR(path);
76556 +
76557 +       ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
76558 +       kfree(path);
76559 +       return ret;
76560 +}
76561 +EXPORT_SYMBOL_GPL(xenbus_mkdir);
76562 +
76563 +/* Destroy a file or directory (directories must be empty). */
76564 +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
76565 +{
76566 +       char *path;
76567 +       int ret;
76568 +
76569 +       path = join(dir, node);
76570 +       if (IS_ERR(path))
76571 +               return PTR_ERR(path);
76572 +
76573 +       ret = xs_error(xs_single(t, XS_RM, path, NULL));
76574 +       kfree(path);
76575 +       return ret;
76576 +}
76577 +EXPORT_SYMBOL_GPL(xenbus_rm);
76578 +
76579 +/* Start a transaction: changes by others will not be seen during this
76580 + * transaction, and changes will not be visible to others until end.
76581 + */
76582 +int xenbus_transaction_start(struct xenbus_transaction *t)
76583 +{
76584 +       char *id_str;
76585 +
76586 +       down_read(&xs_state.suspend_mutex);
76587 +
76588 +       id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
76589 +       if (IS_ERR(id_str)) {
76590 +               up_read(&xs_state.suspend_mutex);
76591 +               return PTR_ERR(id_str);
76592 +       }
76593 +
76594 +       t->id = simple_strtoul(id_str, NULL, 0);
76595 +       kfree(id_str);
76596 +       return 0;
76597 +}
76598 +EXPORT_SYMBOL_GPL(xenbus_transaction_start);
76599 +
76600 +/* End a transaction.
76601 + * If abort is true, the transaction is discarded instead of committed.
76602 + */
76603 +int xenbus_transaction_end(struct xenbus_transaction t, int abort)
76604 +{
76605 +       char abortstr[2];
76606 +       int err;
76607 +
76608 +       if (abort)
76609 +               strcpy(abortstr, "F");
76610 +       else
76611 +               strcpy(abortstr, "T");
76612 +
76613 +       err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
76614 +
76615 +       up_read(&xs_state.suspend_mutex);
76616 +
76617 +       return err;
76618 +}
76619 +EXPORT_SYMBOL_GPL(xenbus_transaction_end);
76620 +
76621 +/* Single read and scanf: returns -errno or num scanned. */
76622 +int xenbus_scanf(struct xenbus_transaction t,
76623 +                const char *dir, const char *node, const char *fmt, ...)
76624 +{
76625 +       va_list ap;
76626 +       int ret;
76627 +       char *val;
76628 +
76629 +       val = xenbus_read(t, dir, node, NULL);
76630 +       if (IS_ERR(val))
76631 +               return PTR_ERR(val);
76632 +
76633 +       va_start(ap, fmt);
76634 +       ret = vsscanf(val, fmt, ap);
76635 +       va_end(ap);
76636 +       kfree(val);
76637 +       /* Distinctive errno. */
76638 +       if (ret == 0)
76639 +               return -ERANGE;
76640 +       return ret;
76641 +}
76642 +EXPORT_SYMBOL_GPL(xenbus_scanf);
76643 +
76644 +/* Single printf and write: returns -errno or 0. */
76645 +int xenbus_printf(struct xenbus_transaction t,
76646 +                 const char *dir, const char *node, const char *fmt, ...)
76647 +{
76648 +       va_list ap;
76649 +       int ret;
76650 +#define PRINTF_BUFFER_SIZE 4096
76651 +       char *printf_buffer;
76652 +
76653 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
76654 +       if (printf_buffer == NULL)
76655 +               return -ENOMEM;
76656 +
76657 +       va_start(ap, fmt);
76658 +       ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
76659 +       va_end(ap);
76660 +
76661 +       BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
76662 +       ret = xenbus_write(t, dir, node, printf_buffer);
76663 +
76664 +       kfree(printf_buffer);
76665 +
76666 +       return ret;
76667 +}
76668 +EXPORT_SYMBOL_GPL(xenbus_printf);
76669 +
76670 +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
76671 +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
76672 +{
76673 +       va_list ap;
76674 +       const char *name;
76675 +       int ret = 0;
76676 +
76677 +       va_start(ap, dir);
76678 +       while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
76679 +               const char *fmt = va_arg(ap, char *);
76680 +               void *result = va_arg(ap, void *);
76681 +               char *p;
76682 +
76683 +               p = xenbus_read(t, dir, name, NULL);
76684 +               if (IS_ERR(p)) {
76685 +                       ret = PTR_ERR(p);
76686 +                       break;
76687 +               }
76688 +               if (fmt) {
76689 +                       if (sscanf(p, fmt, result) == 0)
76690 +                               ret = -EINVAL;
76691 +                       kfree(p);
76692 +               } else
76693 +                       *(char **)result = p;
76694 +       }
76695 +       va_end(ap);
76696 +       return ret;
76697 +}
76698 +EXPORT_SYMBOL_GPL(xenbus_gather);
76699 +
76700 +static int xs_watch(const char *path, const char *token)
76701 +{
76702 +       struct kvec iov[2];
76703 +
76704 +       iov[0].iov_base = (void *)path;
76705 +       iov[0].iov_len = strlen(path) + 1;
76706 +       iov[1].iov_base = (void *)token;
76707 +       iov[1].iov_len = strlen(token) + 1;
76708 +
76709 +       return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
76710 +                                ARRAY_SIZE(iov), NULL));
76711 +}
76712 +
76713 +static int xs_unwatch(const char *path, const char *token)
76714 +{
76715 +       struct kvec iov[2];
76716 +
76717 +       iov[0].iov_base = (char *)path;
76718 +       iov[0].iov_len = strlen(path) + 1;
76719 +       iov[1].iov_base = (char *)token;
76720 +       iov[1].iov_len = strlen(token) + 1;
76721 +
76722 +       return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
76723 +                                ARRAY_SIZE(iov), NULL));
76724 +}
76725 +
76726 +static struct xenbus_watch *find_watch(const char *token)
76727 +{
76728 +       struct xenbus_watch *i, *cmp;
76729 +
76730 +       cmp = (void *)simple_strtoul(token, NULL, 16);
76731 +
76732 +       list_for_each_entry(i, &watches, list)
76733 +               if (i == cmp)
76734 +                       return i;
76735 +
76736 +       return NULL;
76737 +}
76738 +
76739 +/* Register callback to watch this node. */
76740 +int register_xenbus_watch(struct xenbus_watch *watch)
76741 +{
76742 +       /* Pointer in ascii is the token. */
76743 +       char token[sizeof(watch) * 2 + 1];
76744 +       int err;
76745 +
76746 +       sprintf(token, "%lX", (long)watch);
76747 +
76748 +       down_read(&xs_state.suspend_mutex);
76749 +
76750 +       spin_lock(&watches_lock);
76751 +       BUG_ON(find_watch(token));
76752 +       list_add(&watch->list, &watches);
76753 +       spin_unlock(&watches_lock);
76754 +
76755 +       err = xs_watch(watch->node, token);
76756 +
76757 +       /* Ignore errors due to multiple registration. */
76758 +       if ((err != 0) && (err != -EEXIST)) {
76759 +               spin_lock(&watches_lock);
76760 +               list_del(&watch->list);
76761 +               spin_unlock(&watches_lock);
76762 +       }
76763 +
76764 +       up_read(&xs_state.suspend_mutex);
76765 +
76766 +       return err;
76767 +}
76768 +EXPORT_SYMBOL_GPL(register_xenbus_watch);
76769 +
76770 +void unregister_xenbus_watch(struct xenbus_watch *watch)
76771 +{
76772 +       struct xs_stored_msg *msg, *tmp;
76773 +       char token[sizeof(watch) * 2 + 1];
76774 +       int err;
76775 +
76776 +       sprintf(token, "%lX", (long)watch);
76777 +
76778 +       down_read(&xs_state.suspend_mutex);
76779 +
76780 +       spin_lock(&watches_lock);
76781 +       BUG_ON(!find_watch(token));
76782 +       list_del(&watch->list);
76783 +       spin_unlock(&watches_lock);
76784 +
76785 +       err = xs_unwatch(watch->node, token);
76786 +       if (err)
76787 +               printk(KERN_WARNING
76788 +                      "XENBUS Failed to release watch %s: %i\n",
76789 +                      watch->node, err);
76790 +
76791 +       up_read(&xs_state.suspend_mutex);
76792 +
76793 +       /* Cancel pending watch events. */
76794 +       spin_lock(&watch_events_lock);
76795 +       list_for_each_entry_safe(msg, tmp, &watch_events, list) {
76796 +               if (msg->u.watch.handle != watch)
76797 +                       continue;
76798 +               list_del(&msg->list);
76799 +               kfree(msg->u.watch.vec);
76800 +               kfree(msg);
76801 +       }
76802 +       spin_unlock(&watch_events_lock);
76803 +
76804 +       /* Flush any currently-executing callback, unless we are it. :-) */
76805 +       if (current->pid != xenwatch_pid) {
76806 +               mutex_lock(&xenwatch_mutex);
76807 +               mutex_unlock(&xenwatch_mutex);
76808 +       }
76809 +}
76810 +EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
76811 +
76812 +void xs_suspend(void)
76813 +{
76814 +       struct xenbus_watch *watch;
76815 +       char token[sizeof(watch) * 2 + 1];
76816 +
76817 +       down_write(&xs_state.suspend_mutex);
76818 +
76819 +       /* No need for watches_lock: the suspend_mutex is sufficient. */
76820 +       list_for_each_entry(watch, &watches, list) {
76821 +               sprintf(token, "%lX", (long)watch);
76822 +               xs_unwatch(watch->node, token);
76823 +       }
76824 +
76825 +       mutex_lock(&xs_state.request_mutex);
76826 +}
76827 +
76828 +void xs_resume(void)
76829 +{
76830 +       struct xenbus_watch *watch;
76831 +       char token[sizeof(watch) * 2 + 1];
76832 +
76833 +       mutex_unlock(&xs_state.request_mutex);
76834 +
76835 +       /* No need for watches_lock: the suspend_mutex is sufficient. */
76836 +       list_for_each_entry(watch, &watches, list) {
76837 +               sprintf(token, "%lX", (long)watch);
76838 +               xs_watch(watch->node, token);
76839 +       }
76840 +
76841 +       up_write(&xs_state.suspend_mutex);
76842 +}
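+
+/*
+ * xs_suspend()/xs_resume() bracket a save/restore cycle: the suspend path
+ * drops every watch registration and then holds request_mutex so that no new
+ * xenstore requests are started, and the resume path re-issues XS_WATCH for
+ * each entry with the same pointer-derived token, so callers' xenbus_watch
+ * structures remain valid without re-registration.
+ */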
76843 +
76844 +static int xenwatch_handle_callback(void *data)
76845 +{
76846 +       struct xs_stored_msg *msg = data;
76847 +
76848 +       msg->u.watch.handle->callback(msg->u.watch.handle,
76849 +                                     (const char **)msg->u.watch.vec,
76850 +                                     msg->u.watch.vec_size);
76851 +
76852 +       kfree(msg->u.watch.vec);
76853 +       kfree(msg);
76854 +
76855 +       /* Kill this kthread if we were spawned just for this callback. */
76856 +       if (current->pid != xenwatch_pid)
76857 +               do_exit(0);
76858 +
76859 +       return 0;
76860 +}
76861 +
76862 +static int xenwatch_thread(void *unused)
76863 +{
76864 +       struct list_head *ent;
76865 +       struct xs_stored_msg *msg;
76866 +
76867 +       for (;;) {
76868 +               wait_event_interruptible(watch_events_waitq,
76869 +                                        !list_empty(&watch_events));
76870 +
76871 +               if (kthread_should_stop())
76872 +                       break;
76873 +
76874 +               mutex_lock(&xenwatch_mutex);
76875 +
76876 +               spin_lock(&watch_events_lock);
76877 +               ent = watch_events.next;
76878 +               if (ent != &watch_events)
76879 +                       list_del(ent);
76880 +               spin_unlock(&watch_events_lock);
76881 +
76882 +               if (ent != &watch_events) {
76883 +                       msg = list_entry(ent, struct xs_stored_msg, list);
76884 +                       if (msg->u.watch.handle->flags & XBWF_new_thread)
76885 +                               kthread_run(xenwatch_handle_callback,
76886 +                                           msg, "xenwatch_cb");
76887 +                       else
76888 +                               xenwatch_handle_callback(msg);
76889 +               }
76890 +
76891 +               mutex_unlock(&xenwatch_mutex);
76892 +       }
76893 +
76894 +       return 0;
76895 +}
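+
+/*
+ * Watch callbacks normally run one at a time in this xenwatch thread,
+ * serialised by xenwatch_mutex.  A watch flagged XBWF_new_thread is instead
+ * handed to a freshly spawned kthread (which exits in
+ * xenwatch_handle_callback() once the callback returns), so a slow callback
+ * cannot stall delivery of other watch events.
+ */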
76896 +
76897 +static int process_msg(void)
76898 +{
76899 +       struct xs_stored_msg *msg;
76900 +       char *body;
76901 +       int err;
76902 +
76903 +       msg = kmalloc(sizeof(*msg), GFP_KERNEL);
76904 +       if (msg == NULL)
76905 +               return -ENOMEM;
76906 +
76907 +       err = xb_read(&msg->hdr, sizeof(msg->hdr));
76908 +       if (err) {
76909 +               kfree(msg);
76910 +               return err;
76911 +       }
76912 +
76913 +       body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
76914 +       if (body == NULL) {
76915 +               kfree(msg);
76916 +               return -ENOMEM;
76917 +       }
76918 +
76919 +       err = xb_read(body, msg->hdr.len);
76920 +       if (err) {
76921 +               kfree(body);
76922 +               kfree(msg);
76923 +               return err;
76924 +       }
76925 +       body[msg->hdr.len] = '\0';
76926 +
76927 +       if (msg->hdr.type == XS_WATCH_EVENT) {
76928 +               msg->u.watch.vec = split(body, msg->hdr.len,
76929 +                                        &msg->u.watch.vec_size);
76930 +               if (IS_ERR(msg->u.watch.vec)) {
76931 +                       kfree(msg);
76932 +                       return PTR_ERR(msg->u.watch.vec);
76933 +               }
76934 +
76935 +               spin_lock(&watches_lock);
76936 +               msg->u.watch.handle = find_watch(
76937 +                       msg->u.watch.vec[XS_WATCH_TOKEN]);
76938 +               if (msg->u.watch.handle != NULL) {
76939 +                       spin_lock(&watch_events_lock);
76940 +                       list_add_tail(&msg->list, &watch_events);
76941 +                       wake_up(&watch_events_waitq);
76942 +                       spin_unlock(&watch_events_lock);
76943 +               } else {
76944 +                       kfree(msg->u.watch.vec);
76945 +                       kfree(msg);
76946 +               }
76947 +               spin_unlock(&watches_lock);
76948 +       } else {
76949 +               msg->u.reply.body = body;
76950 +               spin_lock(&xs_state.reply_lock);
76951 +               list_add_tail(&msg->list, &xs_state.reply_list);
76952 +               spin_unlock(&xs_state.reply_lock);
76953 +               wake_up(&xs_state.reply_waitq);
76954 +       }
76955 +
76956 +       return 0;
76957 +}
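+
+/*
+ * process_msg() demultiplexes traffic from xenstored: XS_WATCH_EVENT
+ * payloads are split into a string vector, matched against a registered
+ * watch and queued on watch_events for the xenwatch thread, while all other
+ * message types are treated as replies and appended to xs_state.reply_list
+ * for whoever is sleeping on reply_waitq.
+ */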
76958 +
76959 +static int xenbus_thread(void *unused)
76960 +{
76961 +       int err;
76962 +
76963 +       for (;;) {
76964 +               err = process_msg();
76965 +               if (err)
76966 +                       printk(KERN_WARNING "XENBUS error %d while reading "
76967 +                              "message\n", err);
76968 +               if (kthread_should_stop())
76969 +                       break;
76970 +       }
76971 +
76972 +       return 0;
76973 +}
76974 +
76975 +int xs_init(void)
76976 +{
76977 +       int err;
76978 +       struct task_struct *task;
76979 +
76980 +       INIT_LIST_HEAD(&xs_state.reply_list);
76981 +       spin_lock_init(&xs_state.reply_lock);
76982 +       init_waitqueue_head(&xs_state.reply_waitq);
76983 +
76984 +       mutex_init(&xs_state.request_mutex);
76985 +       init_rwsem(&xs_state.suspend_mutex);
76986 +
76987 +       /* Initialize the shared memory rings to talk to xenstored */
76988 +       err = xb_init_comms();
76989 +       if (err)
76990 +               return err;
76991 +
76992 +       task = kthread_run(xenwatch_thread, NULL, "xenwatch");
76993 +       if (IS_ERR(task))
76994 +               return PTR_ERR(task);
76995 +       xenwatch_pid = task->pid;
76996 +
76997 +       task = kthread_run(xenbus_thread, NULL, "xenbus");
76998 +       if (IS_ERR(task))
76999 +               return PTR_ERR(task);
77000 +
77001 +       return 0;
77002 +}
77003 diff -ruNp linux-2.6.19/drivers/xen/xenoprof/xenoprofile.c linux-2.6.19-xen-3.0.4/drivers/xen/xenoprof/xenoprofile.c
77004 --- linux-2.6.19/drivers/xen/xenoprof/xenoprofile.c     1970-01-01 00:00:00.000000000 +0000
77005 +++ linux-2.6.19-xen-3.0.4/drivers/xen/xenoprof/xenoprofile.c   2007-02-02 19:10:46.000000000 +0000
77006 @@ -0,0 +1,500 @@
77007 +/**
77008 + * @file xenoprofile.c
77009 + *
77010 + * @remark Copyright 2002 OProfile authors
77011 + * @remark Read the file COPYING
77012 + *
77013 + * @author John Levon <levon@movementarian.org>
77014 + *
77015 + * Modified by Aravind Menon and Jose Renato Santos for Xen
77016 + * These modifications are:
77017 + * Copyright (C) 2005 Hewlett-Packard Co.
77018 + *
77019 + * Separated out arch-generic part
77020 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
77021 + *                    VA Linux Systems Japan K.K.
77022 + */
77023 +
77024 +#include <linux/init.h>
77025 +#include <linux/notifier.h>
77026 +#include <linux/smp.h>
77027 +#include <linux/oprofile.h>
77028 +#include <linux/sysdev.h>
77029 +#include <linux/slab.h>
77030 +#include <linux/interrupt.h>
77031 +#include <linux/vmalloc.h>
77032 +#include <asm/pgtable.h>
77033 +#include <xen/evtchn.h>
77034 +#include <xen/xenoprof.h>
77035 +#include <xen/driver_util.h>
77036 +#include <xen/interface/xen.h>
77037 +#include <xen/interface/xenoprof.h>
77038 +#include "../../../drivers/oprofile/cpu_buffer.h"
77039 +#include "../../../drivers/oprofile/event_buffer.h"
77040 +
77041 +#define MAX_XENOPROF_SAMPLES 16
77042 +
77043 +/* sample buffers shared with Xen */
77044 +xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS];
77045 +/* Shared buffer area */
77046 +struct xenoprof_shared_buffer shared_buffer;
77047 +
77048 +/* Passive sample buffers shared with Xen */
77049 +xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS];
77050 +/* Passive shared buffer area */
77051 +struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS];
77052 +
77053 +static int xenoprof_start(void);
77054 +static void xenoprof_stop(void);
77055 +
77056 +static int xenoprof_enabled = 0;
77057 +static int xenoprof_is_primary = 0;
77058 +static int active_defined;
77059 +
77060 +/* Number of buffers in shared area (one per VCPU) */
77061 +int nbuf;
77062 +/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */
77063 +int ovf_irq[NR_CPUS];
77064 +/* cpu model type string - copied from Xen memory space on XENOPROF_init command */
77065 +char cpu_type[XENOPROF_CPU_TYPE_SIZE];
77066 +
77067 +#ifdef CONFIG_PM
77068 +
77069 +static int xenoprof_suspend(struct sys_device * dev, pm_message_t state)
77070 +{
77071 +       if (xenoprof_enabled == 1)
77072 +               xenoprof_stop();
77073 +       return 0;
77074 +}
77075 +
77076 +
77077 +static int xenoprof_resume(struct sys_device * dev)
77078 +{
77079 +       if (xenoprof_enabled == 1)
77080 +               xenoprof_start();
77081 +       return 0;
77082 +}
77083 +
77084 +
77085 +static struct sysdev_class oprofile_sysclass = {
77086 +       set_kset_name("oprofile"),
77087 +       .resume         = xenoprof_resume,
77088 +       .suspend        = xenoprof_suspend
77089 +};
77090 +
77091 +
77092 +static struct sys_device device_oprofile = {
77093 +       .id     = 0,
77094 +       .cls    = &oprofile_sysclass,
77095 +};
77096 +
77097 +
77098 +static int __init init_driverfs(void)
77099 +{
77100 +       int error;
77101 +       if (!(error = sysdev_class_register(&oprofile_sysclass)))
77102 +               error = sysdev_register(&device_oprofile);
77103 +       return error;
77104 +}
77105 +
77106 +
77107 +static void exit_driverfs(void)
77108 +{
77109 +       sysdev_unregister(&device_oprofile);
77110 +       sysdev_class_unregister(&oprofile_sysclass);
77111 +}
77112 +
77113 +#else
77114 +#define init_driverfs() do { } while (0)
77115 +#define exit_driverfs() do { } while (0)
77116 +#endif /* CONFIG_PM */
77117 +
77118 +unsigned long long oprofile_samples = 0;
77119 +unsigned long long p_oprofile_samples = 0;
77120 +
77121 +unsigned int pdomains;
77122 +struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
77123 +
77124 +static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
77125 +{
77126 +       int head, tail, size;
77127 +
77128 +       head = buf->event_head;
77129 +       tail = buf->event_tail;
77130 +       size = buf->event_size;
77131 +
77132 +       if (tail > head) {
77133 +               while (tail < size) {
77134 +                       oprofile_add_pc(buf->event_log[tail].eip,
77135 +                                       buf->event_log[tail].mode,
77136 +                                       buf->event_log[tail].event);
77137 +                       if (!is_passive)
77138 +                               oprofile_samples++;
77139 +                       else
77140 +                               p_oprofile_samples++;
77141 +                       tail++;
77142 +               }
77143 +               tail = 0;
77144 +       }
77145 +       while (tail < head) {
77146 +               oprofile_add_pc(buf->event_log[tail].eip,
77147 +                               buf->event_log[tail].mode,
77148 +                               buf->event_log[tail].event);
77149 +               if (!is_passive)
77150 +                       oprofile_samples++;
77151 +               else
77152 +                       p_oprofile_samples++;
77153 +               tail++;
77154 +       }
77155 +
77156 +       buf->event_tail = tail;
77157 +}
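+
+/*
+ * Each shared buffer is a ring: Xen advances event_head as it logs samples,
+ * and this consumer walks event_tail up to event_head (wrapping at
+ * event_size), feeding every sample to oprofile_add_pc() before publishing
+ * the new tail back to Xen.
+ */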
77158 +
77159 +static void xenoprof_handle_passive(void)
77160 +{
77161 +       int i, j;
77162 +       int flag_domain, flag_switch = 0;
77163 +       
77164 +       for (i = 0; i < pdomains; i++) {
77165 +               flag_domain = 0;
77166 +               for (j = 0; j < passive_domains[i].nbuf; j++) {
77167 +                       xenoprof_buf_t *buf = p_xenoprof_buf[i][j];
77168 +                       if (buf->event_head == buf->event_tail)
77169 +                               continue;
77170 +                       if (!flag_domain) {
77171 +                               if (!oprofile_add_domain_switch(passive_domains[i].
77172 +                                                               domain_id))
77173 +                                       goto done;
77174 +                               flag_domain = 1;
77175 +                       }
77176 +                       xenoprof_add_pc(buf, 1);
77177 +                       flag_switch = 1;
77178 +               }
77179 +       }
77180 +done:
77181 +       if (flag_switch)
77182 +               oprofile_add_domain_switch(COORDINATOR_DOMAIN);
77183 +}
77184 +
77185 +static irqreturn_t 
77186 +xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
77187 +{
77188 +       struct xenoprof_buf * buf;
77189 +       int cpu;
77190 +       static unsigned long flag;
77191 +
77192 +       cpu = smp_processor_id();
77193 +       buf = xenoprof_buf[cpu];
77194 +
77195 +       xenoprof_add_pc(buf, 0);
77196 +
77197 +       if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) {
77198 +               xenoprof_handle_passive();
77199 +               smp_mb__before_clear_bit();
77200 +               clear_bit(0, &flag);
77201 +       }
77202 +
77203 +       return IRQ_HANDLED;
77204 +}
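+
+/*
+ * The static 'flag' bit keeps passive-domain processing single-threaded: the
+ * first primary-domain CPU to take the VIRQ drains the passive buffers,
+ * while concurrent interrupts on other CPUs only handle their own per-CPU
+ * buffer.
+ */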
77205 +
77206 +
77207 +static void unbind_virq(void)
77208 +{
77209 +       int i;
77210 +
77211 +       for_each_online_cpu(i) {
77212 +               if (ovf_irq[i] >= 0) {
77213 +                       unbind_from_irqhandler(ovf_irq[i], NULL);
77214 +                       ovf_irq[i] = -1;
77215 +               }
77216 +       }
77217 +}
77218 +
77219 +
77220 +static int bind_virq(void)
77221 +{
77222 +       int i, result;
77223 +
77224 +       for_each_online_cpu(i) {
77225 +               result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
77226 +                                                i,
77227 +                                                xenoprof_ovf_interrupt,
77228 +                                                SA_INTERRUPT,
77229 +                                                "xenoprof",
77230 +                                                NULL);
77231 +
77232 +               if (result < 0) {
77233 +                       unbind_virq();
77234 +                       return result;
77235 +               }
77236 +
77237 +               ovf_irq[i] = result;
77238 +       }
77239 +               
77240 +       return 0;
77241 +}
77242 +
77243 +
77244 +static void unmap_passive_list(void)
77245 +{
77246 +       int i;
77247 +       for (i = 0; i < pdomains; i++)
77248 +               xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]);
77249 +       pdomains = 0;
77250 +}
77251 +
77252 +
77253 +static int map_xenoprof_buffer(int max_samples)
77254 +{
77255 +       struct xenoprof_get_buffer get_buffer;
77256 +       struct xenoprof_buf *buf;
77257 +       int ret, i;
77258 +
77259 +       if ( shared_buffer.buffer )
77260 +               return 0;
77261 +
77262 +       get_buffer.max_samples = max_samples;
77263 +       ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer);
77264 +       if (ret)
77265 +               return ret;
77266 +       nbuf = get_buffer.nbuf;
77267 +
77268 +       for (i=0; i< nbuf; i++) {
77269 +               buf = (struct xenoprof_buf*) 
77270 +                       &shared_buffer.buffer[i * get_buffer.bufsize];
77271 +               BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
77272 +               xenoprof_buf[buf->vcpu_id] = buf;
77273 +       }
77274 +
77275 +       return 0;
77276 +}
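+
+/*
+ * The shared area contains one sample buffer per VCPU; buffers are indexed
+ * by vcpu_id so that xenoprof_ovf_interrupt() can find the buffer for the
+ * CPU it runs on.
+ */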
77277 +
77278 +
77279 +static int xenoprof_setup(void)
77280 +{
77281 +       int ret;
77282 +
77283 +       if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) )
77284 +               return ret;
77285 +
77286 +       if ( (ret = bind_virq()) )
77287 +               return ret;
77288 +
77289 +       if (xenoprof_is_primary) {
77290 +               /* Define dom0 as an active domain if not done yet */
77291 +               if (!active_defined) {
77292 +                       domid_t domid;
77293 +                       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
77294 +                       if (ret)
77295 +                               goto err;
77296 +                       domid = 0;
77297 +                       ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
77298 +                       if (ret)
77299 +                               goto err;
77300 +                       active_defined = 1;
77301 +               }
77302 +
77303 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
77304 +               if (ret)
77305 +                       goto err;
77306 +               xenoprof_arch_counter();
77307 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
77308 +
77309 +               if (ret)
77310 +                       goto err;
77311 +       }
77312 +
77313 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
77314 +       if (ret)
77315 +               goto err;
77316 +
77317 +       xenoprof_enabled = 1;
77318 +       return 0;
77319 + err:
77320 +       unbind_virq();
77321 +       return ret;
77322 +}
77323 +
77324 +
77325 +static void xenoprof_shutdown(void)
77326 +{
77327 +       xenoprof_enabled = 0;
77328 +
77329 +       HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL);
77330 +
77331 +       if (xenoprof_is_primary) {
77332 +               HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL);
77333 +               active_defined = 0;
77334 +       }
77335 +
77336 +       unbind_virq();
77337 +
77338 +       xenoprof_arch_unmap_shared_buffer(&shared_buffer);
77339 +       if (xenoprof_is_primary)
77340 +               unmap_passive_list();
77341 +}
77342 +
77343 +
77344 +static int xenoprof_start(void)
77345 +{
77346 +       int ret = 0;
77347 +
77348 +       if (xenoprof_is_primary)
77349 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
77350 +       if (!ret)
77351 +               xenoprof_arch_start();
77352 +       return ret;
77353 +}
77354 +
77355 +
77356 +static void xenoprof_stop(void)
77357 +{
77358 +       if (xenoprof_is_primary)
77359 +               HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL);
77360 +       xenoprof_arch_stop();
77361 +}
77362 +
77363 +
77364 +static int xenoprof_set_active(int * active_domains,
77365 +                              unsigned int adomains)
77366 +{
77367 +       int ret = 0;
77368 +       int i;
77369 +       int set_dom0 = 0;
77370 +       domid_t domid;
77371 +
77372 +       if (!xenoprof_is_primary)
77373 +               return 0;
77374 +
77375 +       if (adomains > MAX_OPROF_DOMAINS)
77376 +               return -E2BIG;
77377 +
77378 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
77379 +       if (ret)
77380 +               return ret;
77381 +
77382 +       for (i=0; i<adomains; i++) {
77383 +               domid = active_domains[i];
77384 +               if (domid != active_domains[i]) {
77385 +                       ret = -EINVAL;
77386 +                       goto out;
77387 +               }
77388 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
77389 +               if (ret)
77390 +                       goto out;
77391 +               if (active_domains[i] == 0)
77392 +                       set_dom0 = 1;
77393 +       }
77394 +       /* dom0 must always be active but may not be in the list */ 
77395 +       if (!set_dom0) {
77396 +               domid = 0;
77397 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
77398 +       }
77399 +
77400 +out:
77401 +       if (ret)
77402 +               HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
77403 +       active_defined = !ret;
77404 +       return ret;
77405 +}
77406 +
77407 +static int xenoprof_set_passive(int * p_domains,
77408 +                                unsigned int pdoms)
77409 +{
77410 +       int ret;
77411 +       int i, j;
77412 +       struct xenoprof_buf *buf;
77413 +
77414 +       if (!xenoprof_is_primary)
77415 +               return 0;
77416 +
77417 +       if (pdoms > MAX_OPROF_DOMAINS)
77418 +               return -E2BIG;
77419 +
77420 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
77421 +       if (ret)
77422 +               return ret;
77423 +       unmap_passive_list();
77424 +
77425 +       for (i = 0; i < pdoms; i++) {
77426 +               passive_domains[i].domain_id = p_domains[i];
77427 +               passive_domains[i].max_samples = 2048;
77428 +               ret = xenoprof_arch_set_passive(&passive_domains[i],
77429 +                                               &p_shared_buffer[i]);
77430 +               if (ret)
77431 +                       goto out;
77432 +               for (j = 0; j < passive_domains[i].nbuf; j++) {
77433 +                       buf = (struct xenoprof_buf *)
77434 +                               &p_shared_buffer[i].buffer[j * passive_domains[i].bufsize];
77435 +                       BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
77436 +                       p_xenoprof_buf[i][buf->vcpu_id] = buf;
77437 +               }
77438 +       }
77439 +
77440 +       pdomains = pdoms;
77441 +       return 0;
77442 +
77443 +out:
77444 +       for (j = 0; j < i; j++)
77445 +               xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[j]);
77446 +
77447 +       return ret;
77448 +}
77449 +
77450 +struct oprofile_operations xenoprof_ops = {
77451 +#ifdef HAVE_XENOPROF_CREATE_FILES
77452 +       .create_files   = xenoprof_create_files,
77453 +#endif
77454 +       .set_active     = xenoprof_set_active,
77455 +       .set_passive    = xenoprof_set_passive,
77456 +       .setup          = xenoprof_setup,
77457 +       .shutdown       = xenoprof_shutdown,
77458 +       .start          = xenoprof_start,
77459 +       .stop           = xenoprof_stop
77460 +};
77461 +
77462 +
77463 +/* in order to get driverfs right */
77464 +static int using_xenoprof;
77465 +
77466 +int __init xenoprofile_init(struct oprofile_operations * ops)
77467 +{
77468 +       struct xenoprof_init init;
77469 +       int ret, i;
77470 +
77471 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
77472 +       if (!ret) {
77473 +               xenoprof_arch_init_counter(&init);
77474 +               xenoprof_is_primary = init.is_primary;
77475 +
77476 +               /*  cpu_type is detected by Xen */
77477 +               cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
77478 +               strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
77479 +               xenoprof_ops.cpu_type = cpu_type;
77480 +
77481 +               init_driverfs();
77482 +               using_xenoprof = 1;
77483 +               *ops = xenoprof_ops;
77484 +
77485 +               for (i=0; i<NR_CPUS; i++)
77486 +                       ovf_irq[i] = -1;
77487 +
77488 +               active_defined = 0;
77489 +       }
77490 +       printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n",
77491 +              __func__, ret, init.num_events, xenoprof_is_primary);
77492 +       return ret;
77493 +}
77494 +
77495 +
77496 +void xenoprofile_exit(void)
77497 +{
77498 +       if (using_xenoprof)
77499 +               exit_driverfs();
77500 +
77501 +       xenoprof_arch_unmap_shared_buffer(&shared_buffer);
77502 +       if (xenoprof_is_primary) {
77503 +               unmap_passive_list();
77504 +               HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL);
77505 +        }
77506 +}
77507 diff -ruNp linux-2.6.19/fs/Kconfig linux-2.6.19-xen-3.0.4/fs/Kconfig
77508 --- linux-2.6.19/fs/Kconfig     2006-11-29 21:57:37.000000000 +0000
77509 +++ linux-2.6.19-xen-3.0.4/fs/Kconfig   2007-02-02 19:10:46.000000000 +0000
77510 @@ -1009,6 +1009,7 @@ config TMPFS_POSIX_ACL
77511  config HUGETLBFS
77512         bool "HugeTLB file system support"
77513         depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
77514 +       depends !XEN
77515         help
77516           hugetlbfs is a filesystem backing for HugeTLB pages, based on
77517           ramfs. For architectures that support it, say Y here and read
77518 diff -ruNp linux-2.6.19/include/asm-i386/apic.h linux-2.6.19-xen-3.0.4/include/asm-i386/apic.h
77519 --- linux-2.6.19/include/asm-i386/apic.h        2006-11-29 21:57:37.000000000 +0000
77520 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/apic.h      2007-02-02 19:10:54.000000000 +0000
77521 @@ -107,10 +107,12 @@ extern void enable_APIC_timer(void);
77522  
77523  extern void enable_NMI_through_LVT0 (void * dummy);
77524  
77525 +#ifndef CONFIG_XEN
77526  void smp_send_timer_broadcast_ipi(void);
77527  void switch_APIC_timer_to_ipi(void *cpumask);
77528  void switch_ipi_to_APIC_timer(void *cpumask);
77529  #define ARCH_APICTIMER_STOPS_ON_C3     1
77530 +#endif
77531  
77532  extern int timer_over_8254;
77533  
77534 diff -ruNp linux-2.6.19/include/asm-i386/kexec.h linux-2.6.19-xen-3.0.4/include/asm-i386/kexec.h
77535 --- linux-2.6.19/include/asm-i386/kexec.h       2006-11-29 21:57:37.000000000 +0000
77536 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/kexec.h     2007-02-02 19:10:55.000000000 +0000
77537 @@ -98,6 +98,20 @@ relocate_kernel(unsigned long indirectio
77538                 unsigned long start_address,
77539                 unsigned int has_pae) ATTRIB_NORET;
77540  
77541 +
77542 +/* Under Xen we need to work with machine addresses. These macros give the
77543 + * machine address of a certain page to the generic kexec code instead of 
77544 + * the pseudo physical address which would be given by the default macros.
77545 + */
77546 +
77547 +#ifdef CONFIG_XEN
77548 +#define KEXEC_ARCH_HAS_PAGE_MACROS
77549 +#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
77550 +#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
77551 +#define kexec_virt_to_phys(addr) virt_to_machine(addr)
77552 +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
77553 +#endif
77554 +
77555  #endif /* __ASSEMBLY__ */
77556  
77557  #endif /* _I386_KEXEC_H */
77558 diff -ruNp linux-2.6.19/include/asm-i386/mach-default/mach_traps.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-default/mach_traps.h
77559 --- linux-2.6.19/include/asm-i386/mach-default/mach_traps.h     2006-11-29 21:57:37.000000000 +0000
77560 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-default/mach_traps.h   2007-02-02 19:10:55.000000000 +0000
77561 @@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
77562         outb(reason, 0x61);
77563  }
77564  
77565 +static inline void clear_io_check_error(unsigned char reason)
77566 +{
77567 +       unsigned long i;
77568 +
77569 +       reason = (reason & 0xf) | 8;
77570 +       outb(reason, 0x61);
77571 +       i = 2000;
77572 +       while (--i) udelay(1000);
77573 +       reason &= ~8;
77574 +       outb(reason, 0x61);
77575 +}
77576 +
77577  static inline unsigned char get_nmi_reason(void)
77578  {
77579         return inb(0x61);
77580 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/agp.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/agp.h
77581 --- linux-2.6.19/include/asm-i386/mach-xen/asm/agp.h    1970-01-01 00:00:00.000000000 +0000
77582 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/agp.h  2007-02-02 19:10:55.000000000 +0000
77583 @@ -0,0 +1,37 @@
77584 +#ifndef AGP_H
77585 +#define AGP_H 1
77586 +
77587 +#include <asm/pgtable.h>
77588 +#include <asm/cacheflush.h>
77589 +#include <asm/system.h>
77590 +
77591 +/* 
77592 + * Functions to keep the agpgart mappings coherent with the MMU.
77593 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
77594 + * mapped uncacheable. Make sure there are no conflicting mappings
77595 + * with different cacheability attributes for the same page. This avoids
77596 + * data corruption on some CPUs.
77597 + */
77598 +
77599 +int map_page_into_agp(struct page *page);
77600 +int unmap_page_from_agp(struct page *page);
77601 +#define flush_agp_mappings() global_flush_tlb()
77602 +
77603 +/* Could use CLFLUSH here if the cpu supports it. But then it would
77604 +   need to be called for each cacheline of the whole page so it may not be 
77605 +   worth it. Would need a page for it. */
77606 +#define flush_agp_cache() wbinvd()
77607 +
77608 +/* Convert a physical address to an address suitable for the GART. */
77609 +#define phys_to_gart(x) phys_to_machine(x)
77610 +#define gart_to_phys(x) machine_to_phys(x)
77611 +
77612 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
77613 +#define alloc_gatt_pages(order)        ({                                          \
77614 +       char *_t; dma_addr_t _d;                                            \
77615 +       _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
77616 +       _t; })
77617 +#define free_gatt_pages(table, order)  \
77618 +       dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
77619 +
77620 +#endif
77621 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/desc.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/desc.h
77622 --- linux-2.6.19/include/asm-i386/mach-xen/asm/desc.h   1970-01-01 00:00:00.000000000 +0000
77623 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/desc.h 2007-02-02 19:10:55.000000000 +0000
77624 @@ -0,0 +1,203 @@
77625 +#ifndef __ARCH_DESC_H
77626 +#define __ARCH_DESC_H
77627 +
77628 +#include <asm/ldt.h>
77629 +#include <asm/segment.h>
77630 +
77631 +#define CPU_16BIT_STACK_SIZE 1024
77632 +
77633 +#ifndef __ASSEMBLY__
77634 +
77635 +#include <linux/preempt.h>
77636 +#include <linux/smp.h>
77637 +#include <linux/percpu.h>
77638 +
77639 +#include <asm/mmu.h>
77640 +
77641 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
77642 +
77643 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
77644 +
77645 +struct Xgt_desc_struct {
77646 +       unsigned short size;
77647 +       unsigned long address __attribute__((packed));
77648 +       unsigned short pad;
77649 +} __attribute__ ((packed));
77650 +
77651 +extern struct Xgt_desc_struct idt_descr;
77652 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
77653 +
77654 +
77655 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
77656 +{
77657 +       return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
77658 +}
77659 +
77660 +/*
77661 + * This is the ldt that every process will get unless we need
77662 + * something other than this.
77663 + */
77664 +extern struct desc_struct default_ldt[];
77665 +extern struct desc_struct idt_table[];
77666 +extern void set_intr_gate(unsigned int irq, void * addr);
77667 +
77668 +static inline void pack_descriptor(__u32 *a, __u32 *b,
77669 +       unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
77670 +{
77671 +       *a = ((base & 0xffff) << 16) | (limit & 0xffff);
77672 +       *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
77673 +               (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
77674 +}
77675 +
77676 +static inline void pack_gate(__u32 *a, __u32 *b,
77677 +       unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
77678 +{
77679 +       *a = (seg << 16) | (base & 0xffff);
77680 +       *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
77681 +}
77682 +
77683 +#define DESCTYPE_LDT   0x82    /* present, system, DPL-0, LDT */
77684 +#define DESCTYPE_TSS   0x89    /* present, system, DPL-0, 32-bit TSS */
77685 +#define DESCTYPE_TASK  0x85    /* present, system, DPL-0, task gate */
77686 +#define DESCTYPE_INT   0x8e    /* present, system, DPL-0, interrupt gate */
77687 +#define DESCTYPE_TRAP  0x8f    /* present, system, DPL-0, trap gate */
77688 +#define DESCTYPE_DPL3  0x60    /* DPL-3 */
77689 +#define DESCTYPE_S     0x10    /* !system */
77690 +
77691 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
77692 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
77693 +
77694 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
77695 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
77696 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
77697 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
77698 +
77699 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
77700 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
77701 +#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
77702 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
77703 +
77704 +#if TLS_SIZE != 24
77705 +# error update this code.
77706 +#endif
77707 +
77708 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
77709 +{
77710 +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
77711 +       C(0); C(1); C(2);
77712 +#undef C
77713 +}
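+
+/*
+ * TLS descriptors are installed via HYPERVISOR_update_descriptor on the
+ * machine address of the GDT slot, letting Xen validate each entry; contrast
+ * write_dt_entry() below, which is a direct memory write.
+ */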
77714 +
77715 +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
77716 +{
77717 +       __u32 *lp = (__u32 *)((char *)dt + entry*8);
77718 +       *lp = entry_a;
77719 +       *(lp+1) = entry_b;
77720 +}
77721 +
77722 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
77723 +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
77724 +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
77725 +
77726 +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
77727 +{
77728 +       __u32 a, b;
77729 +       pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
77730 +       write_idt_entry(idt_table, gate, a, b);
77731 +}
77732 +
77733 +#ifndef CONFIG_X86_NO_TSS
77734 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
77735 +{
77736 +       __u32 a, b;
77737 +       pack_descriptor(&a, &b, (unsigned long)addr,
77738 +                       offsetof(struct tss_struct, __cacheline_filler) - 1,
77739 +                       DESCTYPE_TSS, 0);
77740 +       write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
77741 +}
77742 +#endif
77743 +
77744 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
77745 +{
77746 +       __u32 a, b;
77747 +       pack_descriptor(&a, &b, (unsigned long)addr,
77748 +                       entries * sizeof(struct desc_struct) - 1,
77749 +                       DESCTYPE_LDT, 0);
77750 +       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
77751 +}
77752 +
77753 +#ifndef CONFIG_X86_NO_TSS
77754 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
77755 +#endif
77756 +
77757 +#define LDT_entry_a(info) \
77758 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
77759 +
77760 +#define LDT_entry_b(info) \
77761 +       (((info)->base_addr & 0xff000000) | \
77762 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
77763 +       ((info)->limit & 0xf0000) | \
77764 +       (((info)->read_exec_only ^ 1) << 9) | \
77765 +       ((info)->contents << 10) | \
77766 +       (((info)->seg_not_present ^ 1) << 15) | \
77767 +       ((info)->seg_32bit << 22) | \
77768 +       ((info)->limit_in_pages << 23) | \
77769 +       ((info)->useable << 20) | \
77770 +       0x7000)
77771 +
77772 +#define LDT_empty(info) (\
77773 +       (info)->base_addr       == 0    && \
77774 +       (info)->limit           == 0    && \
77775 +       (info)->contents        == 0    && \
77776 +       (info)->read_exec_only  == 1    && \
77777 +       (info)->seg_32bit       == 0    && \
77778 +       (info)->limit_in_pages  == 0    && \
77779 +       (info)->seg_not_present == 1    && \
77780 +       (info)->useable         == 0    )
77781 +
77782 +static inline void clear_LDT(void)
77783 +{
77784 +       int cpu = get_cpu();
77785 +
77786 +       /*
77787 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
77788 +        * it slows down context switching. No one uses it anyway.
77789 +        */
77790 +       cpu = cpu;              /* XXX avoid compiler warning */
77791 +       xen_set_ldt(0UL, 0);
77792 +       put_cpu();
77793 +}
77794 +
77795 +/*
77796 + * load one particular LDT into the current CPU
77797 + */
77798 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
77799 +{
77800 +       void *segments = pc->ldt;
77801 +       int count = pc->size;
77802 +
77803 +       if (likely(!count))
77804 +               segments = NULL;
77805 +
77806 +       xen_set_ldt((unsigned long)segments, count);
77807 +}
77808 +
77809 +static inline void load_LDT(mm_context_t *pc)
77810 +{
77811 +       int cpu = get_cpu();
77812 +       load_LDT_nolock(pc, cpu);
77813 +       put_cpu();
77814 +}
77815 +
77816 +static inline unsigned long get_desc_base(unsigned long *desc)
77817 +{
77818 +       unsigned long base;
77819 +       base = ((desc[0] >> 16)  & 0x0000ffff) |
77820 +               ((desc[1] << 16) & 0x00ff0000) |
77821 +               (desc[1] & 0xff000000);
77822 +       return base;
77823 +}
77824 +
77825 +#endif /* !__ASSEMBLY__ */
77826 +
77827 +#endif
77828 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/dma-mapping.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/dma-mapping.h
77829 --- linux-2.6.19/include/asm-i386/mach-xen/asm/dma-mapping.h    1970-01-01 00:00:00.000000000 +0000
77830 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/dma-mapping.h  2007-02-02 19:10:55.000000000 +0000
77831 @@ -0,0 +1,151 @@
77832 +#ifndef _ASM_I386_DMA_MAPPING_H
77833 +#define _ASM_I386_DMA_MAPPING_H
77834 +
77835 +#include <linux/mm.h>
77836 +
77837 +#include <asm/cache.h>
77838 +#include <asm/io.h>
77839 +#include <asm/scatterlist.h>
77840 +#include <asm/bug.h>
77841 +#include <asm/swiotlb.h>
77842 +
77843 +static inline int
77844 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
77845 +{
77846 +       dma_addr_t mask = 0xffffffff;
77847 +       /* If the device has a mask, use it, otherwise default to 32 bits */
77848 +       if (hwdev && hwdev->dma_mask)
77849 +               mask = *hwdev->dma_mask;
77850 +       return (addr & ~mask) != 0;
77851 +}
77852 +
77853 +static inline int
77854 +range_straddles_page_boundary(void *p, size_t size)
77855 +{
77856 +       extern unsigned long *contiguous_bitmap;
77857 +       return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
77858 +               !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
77859 +}
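+
+/*
+ * Under Xen, pages that are adjacent in pseudo-physical space need not be
+ * adjacent in machine memory, so a buffer crossing a page boundary is only
+ * safe for DMA if the contiguous_bitmap records those frames as
+ * machine-contiguous; otherwise the mapping code must treat it as unsafe
+ * (e.g. by bouncing it through swiotlb).
+ */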
77860 +
77861 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
77862 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
77863 +
77864 +void *dma_alloc_coherent(struct device *dev, size_t size,
77865 +                          dma_addr_t *dma_handle, gfp_t flag);
77866 +
77867 +void dma_free_coherent(struct device *dev, size_t size,
77868 +                        void *vaddr, dma_addr_t dma_handle);
77869 +
77870 +extern dma_addr_t
77871 +dma_map_single(struct device *dev, void *ptr, size_t size,
77872 +              enum dma_data_direction direction);
77873 +
77874 +extern void
77875 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
77876 +                enum dma_data_direction direction);
77877 +
77878 +extern int
77879 +dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
77880 +          enum dma_data_direction direction);
77881 +
77882 +extern dma_addr_t
77883 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
77884 +            size_t size, enum dma_data_direction direction);
77885 +
77886 +extern void
77887 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
77888 +              enum dma_data_direction direction);
77889 +
77890 +extern void
77891 +dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
77892 +            enum dma_data_direction direction);
77893 +
77894 +extern void
77895 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
77896 +                       enum dma_data_direction direction);
77897 +
77898 +extern void
77899 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
77900 +                           enum dma_data_direction direction);
77901 +
77902 +static inline void
77903 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
77904 +                             unsigned long offset, size_t size,
77905 +                             enum dma_data_direction direction)
77906 +{
77907 +       dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
77908 +}
77909 +
77910 +static inline void
77911 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
77912 +                                unsigned long offset, size_t size,
77913 +                                enum dma_data_direction direction)
77914 +{
77915 +       dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
77916 +}
77917 +
77918 +static inline void
77919 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
77920 +                   enum dma_data_direction direction)
77921 +{
77922 +       if (swiotlb)
77923 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
77924 +       flush_write_buffers();
77925 +}
77926 +
77927 +static inline void
77928 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
77929 +                   enum dma_data_direction direction)
77930 +{
77931 +       if (swiotlb)
77932 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
77933 +       flush_write_buffers();
77934 +}
77935 +
77936 +extern int
77937 +dma_mapping_error(dma_addr_t dma_addr);
77938 +
77939 +extern int
77940 +dma_supported(struct device *dev, u64 mask);
77941 +
77942 +static inline int
77943 +dma_set_mask(struct device *dev, u64 mask)
77944 +{
77945 +       if(!dev->dma_mask || !dma_supported(dev, mask))
77946 +               return -EIO;
77947 +
77948 +       *dev->dma_mask = mask;
77949 +
77950 +       return 0;
77951 +}
77952 +
77953 +static inline int
77954 +dma_get_cache_alignment(void)
77955 +{
77956 +       /* no easy way to get cache size on all x86, so return the
77957 +        * maximum possible, to be safe */
77958 +       return (1 << INTERNODE_CACHE_SHIFT);
77959 +}
77960 +
77961 +#define dma_is_consistent(d)   (1)
77962 +
77963 +static inline void
77964 +dma_cache_sync(void *vaddr, size_t size,
77965 +              enum dma_data_direction direction)
77966 +{
77967 +       flush_write_buffers();
77968 +}
77969 +
77970 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
77971 +extern int
77972 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
77973 +                           dma_addr_t device_addr, size_t size, int flags);
77974 +
77975 +extern void
77976 +dma_release_declared_memory(struct device *dev);
77977 +
77978 +extern void *
77979 +dma_mark_declared_memory_occupied(struct device *dev,
77980 +                                 dma_addr_t device_addr, size_t size);
77981 +
77982 +#endif
77983 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/fixmap.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/fixmap.h
77984 --- linux-2.6.19/include/asm-i386/mach-xen/asm/fixmap.h 1970-01-01 00:00:00.000000000 +0000
77985 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/fixmap.h       2007-02-02 19:10:55.000000000 +0000
77986 @@ -0,0 +1,160 @@
77987 +/*
77988 + * fixmap.h: compile-time virtual memory allocation
77989 + *
77990 + * This file is subject to the terms and conditions of the GNU General Public
77991 + * License.  See the file "COPYING" in the main directory of this archive
77992 + * for more details.
77993 + *
77994 + * Copyright (C) 1998 Ingo Molnar
77995 + *
77996 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
77997 + */
77998 +
77999 +#ifndef _ASM_FIXMAP_H
78000 +#define _ASM_FIXMAP_H
78001 +
78002 +
78003 +/* used by vmalloc.c, vsyscall.lds.S.
78004 + *
78005 + * Leave one empty page between vmalloc'ed areas and
78006 + * the start of the fixmap.
78007 + */
78008 +#ifndef CONFIG_COMPAT_VDSO
78009 +extern unsigned long __FIXADDR_TOP;
78010 +#else
78011 +#define __FIXADDR_TOP  0xfffff000
78012 +#endif
78013 +
78014 +#ifndef __ASSEMBLY__
78015 +#include <linux/kernel.h>
78016 +#include <asm/acpi.h>
78017 +#include <asm/apicdef.h>
78018 +#include <asm/page.h>
78019 +#ifdef CONFIG_HIGHMEM
78020 +#include <linux/threads.h>
78021 +#include <asm/kmap_types.h>
78022 +#endif
78023 +
78024 +/*
78025 + * Here we define all the compile-time 'special' virtual
78026 + * addresses. The point is to have a constant address at
78027 + * compile time, but to set the physical address only
78028 + * in the boot process. We allocate these special addresses
78029 + * from the end of virtual memory (0xfffff000) backwards.
78030 + * Also this lets us do fail-safe vmalloc(), we
78031 + * can guarantee that these special addresses and
78032 + * vmalloc()-ed addresses never overlap.
78033 + *
78034 + * these 'compile-time allocated' memory buffers are
78035 + * fixed-size 4k pages (or larger if used with an increment
78036 + * higher than 1). Use set_fixmap(idx, phys) to associate
78037 + * physical memory with fixmap indices.
78038 + *
78039 + * TLB entries of such buffers will not be flushed across
78040 + * task switches.
78041 + */
78042 +enum fixed_addresses {
78043 +       FIX_HOLE,
78044 +       FIX_VDSO,
78045 +#ifdef CONFIG_X86_LOCAL_APIC
78046 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
78047 +#endif
78048 +#ifdef CONFIG_X86_IO_APIC
78049 +       FIX_IO_APIC_BASE_0,
78050 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
78051 +#endif
78052 +#ifdef CONFIG_X86_VISWS_APIC
78053 +       FIX_CO_CPU,     /* Cobalt timer */
78054 +       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
78055 +       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
78056 +       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
78057 +#endif
78058 +#ifdef CONFIG_X86_F00F_BUG
78059 +       FIX_F00F_IDT,   /* Virtual mapping for IDT */
78060 +#endif
78061 +#ifdef CONFIG_X86_CYCLONE_TIMER
78062 +       FIX_CYCLONE_TIMER, /*cyclone timer register*/
78063 +#endif 
78064 +#ifdef CONFIG_HIGHMEM
78065 +       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
78066 +       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
78067 +#endif
78068 +#ifdef CONFIG_ACPI
78069 +       FIX_ACPI_BEGIN,
78070 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
78071 +#endif
78072 +#ifdef CONFIG_PCI_MMCONFIG
78073 +       FIX_PCIE_MCFG,
78074 +#endif
78075 +       FIX_SHARED_INFO,
78076 +#define NR_FIX_ISAMAPS 256
78077 +       FIX_ISAMAP_END,
78078 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
78079 +       __end_of_permanent_fixed_addresses,
78080 +       /* temporary boot-time mappings, used before ioremap() is functional */
78081 +#define NR_FIX_BTMAPS  16
78082 +       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
78083 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
78084 +       FIX_WP_TEST,
78085 +       __end_of_fixed_addresses
78086 +};
78087 +
78088 +extern void __set_fixmap(enum fixed_addresses idx,
78089 +                                       maddr_t phys, pgprot_t flags);
78090 +
78091 +extern void reserve_top_address(unsigned long reserve);
78092 +extern void set_fixaddr_top(void);
78093 +
78094 +#define set_fixmap(idx, phys) \
78095 +               __set_fixmap(idx, phys, PAGE_KERNEL)
78096 +/*
78097 + * Some hardware wants to get fixmapped without caching.
78098 + */
78099 +#define set_fixmap_nocache(idx, phys) \
78100 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
78101 +
78102 +#define clear_fixmap(idx) \
78103 +               __set_fixmap(idx, 0, __pgprot(0))
78104 +
78105 +#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
78106 +
78107 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
78108 +#define __FIXADDR_BOOT_SIZE    (__end_of_fixed_addresses << PAGE_SHIFT)
78109 +#define FIXADDR_START          (FIXADDR_TOP - __FIXADDR_SIZE)
78110 +#define FIXADDR_BOOT_START     (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
78111 +
78112 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
78113 +#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
78114 +
78115 +extern void __this_fixmap_does_not_exist(void);
78116 +
78117 +/*
78118 + * 'index to address' translation. If anyone tries to use the idx
78119 + * directly without translation, we catch the bug with a NULL-dereference
78120 + * kernel oops. Illegal ranges of incoming indices are caught too.
78121 + */
78122 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
78123 +{
78124 +       /*
78125 +        * this branch gets completely eliminated after inlining,
78126 +        * except when someone tries to use fixaddr indices in an
78127 +        * illegal way. (such as mixing up address types or using
78128 +        * out-of-range indices).
78129 +        *
78130 +        * If it doesn't get removed, the linker will complain
78131 +        * loudly with a reasonably clear error message.
78132 +        */
78133 +       if (idx >= __end_of_fixed_addresses)
78134 +               __this_fixmap_does_not_exist();
78135 +
78136 +        return __fix_to_virt(idx);
78137 +}
78138 +
78139 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
78140 +{
78141 +       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
78142 +       return __virt_to_fix(vaddr);
78143 +}
78144 +
78145 +#endif /* !__ASSEMBLY__ */
78146 +#endif
78147 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/floppy.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/floppy.h
78148 --- linux-2.6.19/include/asm-i386/mach-xen/asm/floppy.h 1970-01-01 00:00:00.000000000 +0000
78149 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/floppy.h       2007-02-02 19:10:55.000000000 +0000
78150 @@ -0,0 +1,147 @@
78151 +/*
78152 + * Architecture specific parts of the Floppy driver
78153 + *
78154 + * This file is subject to the terms and conditions of the GNU General Public
78155 + * License.  See the file "COPYING" in the main directory of this archive
78156 + * for more details.
78157 + *
78158 + * Copyright (C) 1995
78159 + *
78160 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
78161 + */
78162 +#ifndef __ASM_XEN_I386_FLOPPY_H
78163 +#define __ASM_XEN_I386_FLOPPY_H
78164 +
78165 +#include <linux/vmalloc.h>
78166 +
78167 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
78168 +#include <asm/dma.h>
78169 +#undef MAX_DMA_ADDRESS
78170 +#define MAX_DMA_ADDRESS 0
78171 +#define CROSS_64KB(a,s) (0)
78172 +
78173 +#define fd_inb(port)                   inb_p(port)
78174 +#define fd_outb(value,port)            outb_p(value,port)
78175 +
78176 +#define fd_request_dma()        (0)
78177 +#define fd_free_dma()           ((void)0)
78178 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
78179 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
78180 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
78181 +#define fd_get_dma_residue()    (virtual_dma_count + virtual_dma_residue)
78182 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
78183 +/*
78184 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
78185 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
78186 + */
78187 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
78188 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
78189 +
78190 +static int virtual_dma_count;
78191 +static int virtual_dma_residue;
78192 +static char *virtual_dma_addr;
78193 +static int virtual_dma_mode;
78194 +static int doing_pdma;
78195 +
78196 +static irqreturn_t floppy_hardint(int irq, void *dev_id)
78197 +{
78198 +       register unsigned char st;
78199 +       register int lcount;
78200 +       register char *lptr;
78201 +
78202 +       if (!doing_pdma)
78203 +               return floppy_interrupt(irq, dev_id);
78204 +
78205 +       st = 1;
78206 +       for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
78207 +           lcount; lcount--, lptr++) {
78208 +               st=inb(virtual_dma_port+4) & 0xa0 ;
78209 +               if(st != 0xa0) 
78210 +                       break;
78211 +               if(virtual_dma_mode)
78212 +                       outb_p(*lptr, virtual_dma_port+5);
78213 +               else
78214 +                       *lptr = inb_p(virtual_dma_port+5);
78215 +       }
78216 +       virtual_dma_count = lcount;
78217 +       virtual_dma_addr = lptr;
78218 +       st = inb(virtual_dma_port+4);
78219 +
78220 +       if(st == 0x20)
78221 +               return IRQ_HANDLED;
78222 +       if(!(st & 0x20)) {
78223 +               virtual_dma_residue += virtual_dma_count;
78224 +               virtual_dma_count=0;
78225 +               doing_pdma = 0;
78226 +               floppy_interrupt(irq, dev_id);
78227 +               return IRQ_HANDLED;
78228 +       }
78229 +       return IRQ_HANDLED;
78230 +}
78231 +
78232 +static void fd_disable_dma(void)
78233 +{
78234 +       doing_pdma = 0;
78235 +       virtual_dma_residue += virtual_dma_count;
78236 +       virtual_dma_count=0;
78237 +}
78238 +
78239 +static int fd_request_irq(void)
78240 +{
78241 +       return request_irq(FLOPPY_IRQ, floppy_hardint,
78242 +                          IRQF_DISABLED, "floppy", NULL);
78243 +}
78244 +
78245 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
78246 +{
78247 +       doing_pdma = 1;
78248 +       virtual_dma_port = io;
78249 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
78250 +       virtual_dma_addr = addr;
78251 +       virtual_dma_count = size;
78252 +       virtual_dma_residue = 0;
78253 +       return 0;
78254 +}
78255 +
78256 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
78257 +#define FDC1 xen_floppy_init()
78258 +static int FDC2 = -1;
78259 +
78260 +static int xen_floppy_init(void)
78261 +{
78262 +       use_virtual_dma = 1;
78263 +       can_use_virtual_dma = 1;
78264 +       return 0x3f0;
78265 +}
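+
+/*
+ * Returning 0x3f0 selects the standard primary FDC base; the real point of
+ * this hook is its side effect of forcing virtual DMA, so transfers go
+ * byte-by-byte through floppy_hardint() rather than through the ISA DMA
+ * controller, which a Xen guest cannot usefully program with its
+ * pseudo-physical addresses.
+ */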
78266 +
78267 +/*
78268 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
78269 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
78270 + * coincides with another rtc CMOS user.               Paul G.
78271 + */
78272 +#define FLOPPY0_TYPE   ({                              \
78273 +       unsigned long flags;                            \
78274 +       unsigned char val;                              \
78275 +       spin_lock_irqsave(&rtc_lock, flags);            \
78276 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
78277 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
78278 +       val;                                            \
78279 +})
78280 +
78281 +#define FLOPPY1_TYPE   ({                              \
78282 +       unsigned long flags;                            \
78283 +       unsigned char val;                              \
78284 +       spin_lock_irqsave(&rtc_lock, flags);            \
78285 +       val = CMOS_READ(0x10) & 15;                     \
78286 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
78287 +       val;                                            \
78288 +})
78289 +
78290 +#define N_FDC 2
78291 +#define N_DRIVE 8
78292 +
78293 +#define FLOPPY_MOTOR_MASK 0xf0
78294 +
78295 +#define EXTRA_FLOPPY_PARAMS
78296 +
78297 +#endif /* __ASM_XEN_I386_FLOPPY_H */
78298 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/highmem.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/highmem.h
78299 --- linux-2.6.19/include/asm-i386/mach-xen/asm/highmem.h        1970-01-01 00:00:00.000000000 +0000
78300 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/highmem.h      2007-02-02 19:10:55.000000000 +0000
78301 @@ -0,0 +1,80 @@
78302 +/*
78303 + * highmem.h: virtual kernel memory mappings for high memory
78304 + *
78305 + * Used in CONFIG_HIGHMEM systems for memory pages which
78306 + * are not addressable by direct kernel virtual addresses.
78307 + *
78308 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
78309 + *                   Gerhard.Wichert@pdb.siemens.de
78310 + *
78311 + *
78312 + * Redesigned the x86 32-bit VM architecture to deal with 
78313 + * up to 16 Terabyte physical memory. With current x86 CPUs
78314 + * we now support up to 64 Gigabytes physical RAM.
78315 + *
78316 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
78317 + */
78318 +
78319 +#ifndef _ASM_HIGHMEM_H
78320 +#define _ASM_HIGHMEM_H
78321 +
78322 +#ifdef __KERNEL__
78323 +
78324 +#include <linux/interrupt.h>
78325 +#include <linux/threads.h>
78326 +#include <asm/kmap_types.h>
78327 +#include <asm/tlbflush.h>
78328 +
78329 +/* declarations for highmem.c */
78330 +extern unsigned long highstart_pfn, highend_pfn;
78331 +
78332 +extern pte_t *kmap_pte;
78333 +extern pgprot_t kmap_prot;
78334 +extern pte_t *pkmap_page_table;
78335 +
78336 +/*
78337 + * Right now we initialize only a single pte table. It can be extended
78338 + * easily, subsequent pte tables have to be allocated in one physical
78339 + * chunk of RAM.
78340 + */
78341 +#ifdef CONFIG_X86_PAE
78342 +#define LAST_PKMAP 512
78343 +#else
78344 +#define LAST_PKMAP 1024
78345 +#endif
78346 +/*
78347 + * Ordering is:
78348 + *
78349 + * FIXADDR_TOP
78350 + *                     fixed_addresses
78351 + * FIXADDR_START
78352 + *                     temp fixed addresses
78353 + * FIXADDR_BOOT_START
78354 + *                     Persistent kmap area
78355 + * PKMAP_BASE
78356 + * VMALLOC_END
78357 + *                     Vmalloc area
78358 + * VMALLOC_START
78359 + * high_memory
78360 + */
78361 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
78362 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
78363 +#define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
78364 +#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
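As a quick illustration of the pkmap macros above (assuming 4 KiB pages and the non-PAE LAST_PKMAP of 1024, i.e. a 4 MiB window just below FIXADDR_BOOT_START, aligned down to a PMD boundary), PKMAP_NR() and PKMAP_ADDR() are inverses within that window; a minimal sketch, not part of the patched header:

    /* Illustrative only: slot <-> address round trip inside the pkmap window. */
    unsigned long nr   = 5;                    /* hypothetical slot index */
    unsigned long addr = PKMAP_ADDR(nr);       /* PKMAP_BASE + 5 * PAGE_SIZE */
    BUG_ON(PKMAP_NR(addr) != nr);              /* the inverse mapping holds */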
78365 +
78366 +extern void * FASTCALL(kmap_high(struct page *page));
78367 +extern void FASTCALL(kunmap_high(struct page *page));
78368 +
78369 +void *kmap(struct page *page);
78370 +void kunmap(struct page *page);
78371 +void *kmap_atomic(struct page *page, enum km_type type);
78372 +void *kmap_atomic_pte(struct page *page, enum km_type type);
78373 +void kunmap_atomic(void *kvaddr, enum km_type type);
78374 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
78375 +struct page *kmap_atomic_to_page(void *ptr);
78376 +
78377 +#define flush_cache_kmaps()    do { } while (0)
78378 +
78379 +#endif /* __KERNEL__ */
78380 +
78381 +#endif /* _ASM_HIGHMEM_H */
78382 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/hypercall.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/hypercall.h
78383 --- linux-2.6.19/include/asm-i386/mach-xen/asm/hypercall.h      1970-01-01 00:00:00.000000000 +0000
78384 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/hypercall.h    2007-02-02 19:10:55.000000000 +0000
78385 @@ -0,0 +1,407 @@
78386 +/******************************************************************************
78387 + * hypercall.h
78388 + * 
78389 + * Linux-specific hypervisor handling.
78390 + * 
78391 + * Copyright (c) 2002-2004, K A Fraser
78392 + * 
78393 + * This program is free software; you can redistribute it and/or
78394 + * modify it under the terms of the GNU General Public License version 2
78395 + * as published by the Free Software Foundation; or, when distributed
78396 + * separately from the Linux kernel or incorporated into other
78397 + * software packages, subject to the following license:
78398 + * 
78399 + * Permission is hereby granted, free of charge, to any person obtaining a copy
78400 + * of this source file (the "Software"), to deal in the Software without
78401 + * restriction, including without limitation the rights to use, copy, modify,
78402 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
78403 + * and to permit persons to whom the Software is furnished to do so, subject to
78404 + * the following conditions:
78405 + * 
78406 + * The above copyright notice and this permission notice shall be included in
78407 + * all copies or substantial portions of the Software.
78408 + * 
78409 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
78410 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
78411 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
78412 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
78413 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
78414 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
78415 + * IN THE SOFTWARE.
78416 + */
78417 +
78418 +#ifndef __HYPERCALL_H__
78419 +#define __HYPERCALL_H__
78420 +
78421 +#include <linux/string.h> /* memcpy() */
78422 +
78423 +#ifndef __HYPERVISOR_H__
78424 +# error "please don't include this file directly"
78425 +#endif
78426 +
78427 +#define __STR(x) #x
78428 +#define STR(x) __STR(x)
78429 +
78430 +#ifdef CONFIG_XEN
78431 +#define HYPERCALL_STR(name)                                    \
78432 +       "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
78433 +#else
78434 +#define HYPERCALL_STR(name)                                    \
78435 +       "mov hypercall_stubs,%%eax; "                           \
78436 +       "add $("STR(__HYPERVISOR_##name)" * 32),%%eax; "        \
78437 +       "call *%%eax"
78438 +#endif
78439 +
78440 +#define _hypercall0(type, name)                        \
78441 +({                                             \
78442 +       long __res;                             \
78443 +       asm volatile (                          \
78444 +               HYPERCALL_STR(name)             \
78445 +               : "=a" (__res)                  \
78446 +               :                               \
78447 +               : "memory" );                   \
78448 +       (type)__res;                            \
78449 +})
78450 +
78451 +#define _hypercall1(type, name, a1)                            \
78452 +({                                                             \
78453 +       long __res, __ign1;                                     \
78454 +       asm volatile (                                          \
78455 +               HYPERCALL_STR(name)                             \
78456 +               : "=a" (__res), "=b" (__ign1)                   \
78457 +               : "1" ((long)(a1))                              \
78458 +               : "memory" );                                   \
78459 +       (type)__res;                                            \
78460 +})
78461 +
78462 +#define _hypercall2(type, name, a1, a2)                                \
78463 +({                                                             \
78464 +       long __res, __ign1, __ign2;                             \
78465 +       asm volatile (                                          \
78466 +               HYPERCALL_STR(name)                             \
78467 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2)    \
78468 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
78469 +               : "memory" );                                   \
78470 +       (type)__res;                                            \
78471 +})
78472 +
78473 +#define _hypercall3(type, name, a1, a2, a3)                    \
78474 +({                                                             \
78475 +       long __res, __ign1, __ign2, __ign3;                     \
78476 +       asm volatile (                                          \
78477 +               HYPERCALL_STR(name)                             \
78478 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
78479 +               "=d" (__ign3)                                   \
78480 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
78481 +               "3" ((long)(a3))                                \
78482 +               : "memory" );                                   \
78483 +       (type)__res;                                            \
78484 +})
78485 +
78486 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
78487 +({                                                             \
78488 +       long __res, __ign1, __ign2, __ign3, __ign4;             \
78489 +       asm volatile (                                          \
78490 +               HYPERCALL_STR(name)                             \
78491 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
78492 +               "=d" (__ign3), "=S" (__ign4)                    \
78493 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
78494 +               "3" ((long)(a3)), "4" ((long)(a4))              \
78495 +               : "memory" );                                   \
78496 +       (type)__res;                                            \
78497 +})
78498 +
78499 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
78500 +({                                                             \
78501 +       long __res, __ign1, __ign2, __ign3, __ign4, __ign5;     \
78502 +       asm volatile (                                          \
78503 +               HYPERCALL_STR(name)                             \
78504 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
78505 +               "=d" (__ign3), "=S" (__ign4), "=D" (__ign5)     \
78506 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
78507 +               "3" ((long)(a3)), "4" ((long)(a4)),             \
78508 +               "5" ((long)(a5))                                \
78509 +               : "memory" );                                   \
78510 +       (type)__res;                                            \
78511 +})
78512 +
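For orientation, here is roughly what a two-argument invocation compiles down to on a CONFIG_XEN build, read off the asm constraints above (an illustrative expansion, not actual compiler output):

    /*
     * _hypercall2(int, sched_op, cmd, arg)
     *
     *         movl  cmd, %ebx        ; a1 via the "=b"/"1" constraint pair
     *         movl  arg, %ecx        ; a2 via the "=c"/"2" constraint pair
     *         call  hypercall_page + (__HYPERVISOR_sched_op * 32)
     *
     * The result comes back in %eax and is cast to (int); each hypercall
     * owns a 32-byte stub slot in hypercall_page.
     */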
78513 +static inline int
78514 +HYPERVISOR_set_trap_table(
78515 +       trap_info_t *table)
78516 +{
78517 +       return _hypercall1(int, set_trap_table, table);
78518 +}
78519 +
78520 +static inline int
78521 +HYPERVISOR_mmu_update(
78522 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
78523 +{
78524 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
78525 +}
78526 +
78527 +static inline int
78528 +HYPERVISOR_mmuext_op(
78529 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
78530 +{
78531 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
78532 +}
78533 +
78534 +static inline int
78535 +HYPERVISOR_set_gdt(
78536 +       unsigned long *frame_list, int entries)
78537 +{
78538 +       return _hypercall2(int, set_gdt, frame_list, entries);
78539 +}
78540 +
78541 +static inline int
78542 +HYPERVISOR_stack_switch(
78543 +       unsigned long ss, unsigned long esp)
78544 +{
78545 +       return _hypercall2(int, stack_switch, ss, esp);
78546 +}
78547 +
78548 +static inline int
78549 +HYPERVISOR_set_callbacks(
78550 +       unsigned long event_selector, unsigned long event_address,
78551 +       unsigned long failsafe_selector, unsigned long failsafe_address)
78552 +{
78553 +       return _hypercall4(int, set_callbacks,
78554 +                          event_selector, event_address,
78555 +                          failsafe_selector, failsafe_address);
78556 +}
78557 +
78558 +static inline int
78559 +HYPERVISOR_fpu_taskswitch(
78560 +       int set)
78561 +{
78562 +       return _hypercall1(int, fpu_taskswitch, set);
78563 +}
78564 +
78565 +static inline int
78566 +HYPERVISOR_sched_op_compat(
78567 +       int cmd, unsigned long arg)
78568 +{
78569 +       return _hypercall2(int, sched_op_compat, cmd, arg);
78570 +}
78571 +
78572 +static inline int
78573 +HYPERVISOR_sched_op(
78574 +       int cmd, void *arg)
78575 +{
78576 +       return _hypercall2(int, sched_op, cmd, arg);
78577 +}
78578 +
78579 +static inline long
78580 +HYPERVISOR_set_timer_op(
78581 +       u64 timeout)
78582 +{
78583 +       unsigned long timeout_hi = (unsigned long)(timeout>>32);
78584 +       unsigned long timeout_lo = (unsigned long)timeout;
78585 +       return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
78586 +}
78587 +
78588 +static inline int
78589 +HYPERVISOR_dom0_op(
78590 +       dom0_op_t *dom0_op)
78591 +{
78592 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
78593 +       return _hypercall1(int, dom0_op, dom0_op);
78594 +}
78595 +
78596 +static inline int
78597 +HYPERVISOR_set_debugreg(
78598 +       int reg, unsigned long value)
78599 +{
78600 +       return _hypercall2(int, set_debugreg, reg, value);
78601 +}
78602 +
78603 +static inline unsigned long
78604 +HYPERVISOR_get_debugreg(
78605 +       int reg)
78606 +{
78607 +       return _hypercall1(unsigned long, get_debugreg, reg);
78608 +}
78609 +
78610 +static inline int
78611 +HYPERVISOR_update_descriptor(
78612 +       u64 ma, u64 desc)
78613 +{
78614 +       return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
78615 +}
78616 +
78617 +static inline int
78618 +HYPERVISOR_memory_op(
78619 +       unsigned int cmd, void *arg)
78620 +{
78621 +       return _hypercall2(int, memory_op, cmd, arg);
78622 +}
78623 +
78624 +static inline int
78625 +HYPERVISOR_multicall(
78626 +       void *call_list, int nr_calls)
78627 +{
78628 +       return _hypercall2(int, multicall, call_list, nr_calls);
78629 +}
78630 +
78631 +static inline int
78632 +HYPERVISOR_update_va_mapping(
78633 +       unsigned long va, pte_t new_val, unsigned long flags)
78634 +{
78635 +       unsigned long pte_hi = 0;
78636 +#ifdef CONFIG_X86_PAE
78637 +       pte_hi = new_val.pte_high;
78638 +#endif
78639 +       return _hypercall4(int, update_va_mapping, va,
78640 +                          new_val.pte_low, pte_hi, flags);
78641 +}
78642 +
78643 +static inline int
78644 +HYPERVISOR_event_channel_op(
78645 +       int cmd, void *arg)
78646 +{
78647 +       int rc = _hypercall2(int, event_channel_op, cmd, arg);
78648 +
78649 +#ifdef CONFIG_XEN_COMPAT_030002
78650 +       if (unlikely(rc == -ENOSYS)) {
78651 +               struct evtchn_op op;
78652 +               op.cmd = cmd;
78653 +               memcpy(&op.u, arg, sizeof(op.u));
78654 +               rc = _hypercall1(int, event_channel_op_compat, &op);
78655 +               memcpy(arg, &op.u, sizeof(op.u));
78656 +       }
78657 +#endif
78658 +
78659 +       return rc;
78660 +}
78661 +
78662 +static inline int
78663 +HYPERVISOR_acm_op(
78664 +       int cmd, void *arg)
78665 +{
78666 +       return _hypercall2(int, acm_op, cmd, arg);
78667 +}
78668 +
78669 +static inline int
78670 +HYPERVISOR_xen_version(
78671 +       int cmd, void *arg)
78672 +{
78673 +       return _hypercall2(int, xen_version, cmd, arg);
78674 +}
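A minimal usage sketch (illustrative only; assumes <xen/interface/version.h> is in scope for XENVER_version, which packs the hypervisor version into the return value):

    int ver = HYPERVISOR_xen_version(XENVER_version, NULL);
    printk(KERN_INFO "running on Xen %d.%d\n", ver >> 16, ver & 0xffff);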
78675 +
78676 +static inline int
78677 +HYPERVISOR_console_io(
78678 +       int cmd, int count, char *str)
78679 +{
78680 +       return _hypercall3(int, console_io, cmd, count, str);
78681 +}
78682 +
78683 +static inline int
78684 +HYPERVISOR_physdev_op(
78685 +       int cmd, void *arg)
78686 +{
78687 +       int rc = _hypercall2(int, physdev_op, cmd, arg);
78688 +
78689 +#ifdef CONFIG_XEN_COMPAT_030002
78690 +       if (unlikely(rc == -ENOSYS)) {
78691 +               struct physdev_op op;
78692 +               op.cmd = cmd;
78693 +               memcpy(&op.u, arg, sizeof(op.u));
78694 +               rc = _hypercall1(int, physdev_op_compat, &op);
78695 +               memcpy(arg, &op.u, sizeof(op.u));
78696 +       }
78697 +#endif
78698 +
78699 +       return rc;
78700 +}
78701 +
78702 +static inline int
78703 +HYPERVISOR_grant_table_op(
78704 +       unsigned int cmd, void *uop, unsigned int count)
78705 +{
78706 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
78707 +}
78708 +
78709 +static inline int
78710 +HYPERVISOR_update_va_mapping_otherdomain(
78711 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
78712 +{
78713 +       unsigned long pte_hi = 0;
78714 +#ifdef CONFIG_X86_PAE
78715 +       pte_hi = new_val.pte_high;
78716 +#endif
78717 +       return _hypercall5(int, update_va_mapping_otherdomain, va,
78718 +                          new_val.pte_low, pte_hi, flags, domid);
78719 +}
78720 +
78721 +static inline int
78722 +HYPERVISOR_vm_assist(
78723 +       unsigned int cmd, unsigned int type)
78724 +{
78725 +       return _hypercall2(int, vm_assist, cmd, type);
78726 +}
78727 +
78728 +static inline int
78729 +HYPERVISOR_vcpu_op(
78730 +       int cmd, int vcpuid, void *extra_args)
78731 +{
78732 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
78733 +}
78734 +
78735 +static inline int
78736 +HYPERVISOR_suspend(
78737 +       unsigned long srec)
78738 +{
78739 +       struct sched_shutdown sched_shutdown = {
78740 +               .reason = SHUTDOWN_suspend
78741 +       };
78742 +
78743 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
78744 +                            &sched_shutdown, srec);
78745 +
78746 +#ifdef CONFIG_XEN_COMPAT_030002
78747 +       if (rc == -ENOSYS)
78748 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
78749 +                                SHUTDOWN_suspend, srec);
78750 +#endif
78751 +
78752 +       return rc;
78753 +}
78754 +
78755 +static inline int
78756 +HYPERVISOR_nmi_op(
78757 +       unsigned long op, void *arg)
78758 +{
78759 +       return _hypercall2(int, nmi_op, op, arg);
78760 +}
78761 +
78762 +static inline unsigned long
78763 +HYPERVISOR_hvm_op(
78764 +    int op, void *arg)
78765 +{
78766 +    return _hypercall2(unsigned long, hvm_op, op, arg);
78767 +}
78768 +
78769 +static inline int
78770 +HYPERVISOR_callback_op(
78771 +       int cmd, void *arg)
78772 +{
78773 +       return _hypercall2(int, callback_op, cmd, arg);
78774 +}
78775 +
78776 +static inline int
78777 +HYPERVISOR_xenoprof_op(
78778 +       int op, void *arg)
78779 +{
78780 +       return _hypercall2(int, xenoprof_op, op, arg);
78781 +}
78782 +
78783 +static inline int
78784 +HYPERVISOR_kexec_op(
78785 +       unsigned long op, void *args)
78786 +{
78787 +       return _hypercall2(int, kexec_op, op, args);
78788 +}
78789 +
78790 +
78791 +
78792 +#endif /* __HYPERCALL_H__ */
78793 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/hypervisor.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/hypervisor.h
78794 --- linux-2.6.19/include/asm-i386/mach-xen/asm/hypervisor.h     1970-01-01 00:00:00.000000000 +0000
78795 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/hypervisor.h   2007-02-02 19:10:55.000000000 +0000
78796 @@ -0,0 +1,245 @@
78797 +/******************************************************************************
78798 + * hypervisor.h
78799 + * 
78800 + * Linux-specific hypervisor handling.
78801 + * 
78802 + * Copyright (c) 2002-2004, K A Fraser
78803 + * 
78804 + * This program is free software; you can redistribute it and/or
78805 + * modify it under the terms of the GNU General Public License version 2
78806 + * as published by the Free Software Foundation; or, when distributed
78807 + * separately from the Linux kernel or incorporated into other
78808 + * software packages, subject to the following license:
78809 + * 
78810 + * Permission is hereby granted, free of charge, to any person obtaining a copy
78811 + * of this source file (the "Software"), to deal in the Software without
78812 + * restriction, including without limitation the rights to use, copy, modify,
78813 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
78814 + * and to permit persons to whom the Software is furnished to do so, subject to
78815 + * the following conditions:
78816 + * 
78817 + * The above copyright notice and this permission notice shall be included in
78818 + * all copies or substantial portions of the Software.
78819 + * 
78820 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
78821 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
78822 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
78823 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
78824 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
78825 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
78826 + * IN THE SOFTWARE.
78827 + */
78828 +
78829 +#ifndef __HYPERVISOR_H__
78830 +#define __HYPERVISOR_H__
78831 +
78832 +#include <linux/types.h>
78833 +#include <linux/kernel.h>
78834 +#include <linux/version.h>
78835 +#include <linux/errno.h>
78836 +#include <xen/interface/xen.h>
78837 +#include <xen/interface/dom0_ops.h>
78838 +#include <xen/interface/event_channel.h>
78839 +#include <xen/interface/physdev.h>
78840 +#include <xen/interface/sched.h>
78841 +#include <xen/interface/nmi.h>
78842 +#include <asm/ptrace.h>
78843 +#include <asm/page.h>
78844 +#if defined(__i386__)
78845 +#  ifdef CONFIG_X86_PAE
78846 +#   include <asm-generic/pgtable-nopud.h>
78847 +#  else
78848 +#   include <asm-generic/pgtable-nopmd.h>
78849 +#  endif
78850 +#endif
78851 +
78852 +extern shared_info_t *HYPERVISOR_shared_info;
78853 +
78854 +#ifdef CONFIG_X86_32
78855 +extern unsigned long hypervisor_virt_start;
78856 +#endif
78857 +
78858 +/* arch/xen/i386/kernel/setup.c */
78859 +extern start_info_t *xen_start_info;
78860 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
78861 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
78862 +#else
78863 +#define is_initial_xendomain() 0
78864 +#endif
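This flag is how the patched code tells dom0 apart from unprivileged guests; a minimal sketch of its typical use (both helper names are hypothetical):

    if (is_initial_xendomain())
            init_backend_drivers();     /* privileged guest: may own real hardware */
    else
            init_frontend_drivers();    /* domU: talk to dom0 via PV frontends */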
78865 +
78866 +/* arch/xen/kernel/evtchn.c */
78867 +/* Force a proper event-channel callback from Xen. */
78868 +void force_evtchn_callback(void);
78869 +
78870 +/* arch/xen/kernel/process.c */
78871 +void xen_cpu_idle (void);
78872 +
78873 +/* arch/xen/i386/kernel/hypervisor.c */
78874 +void do_hypervisor_callback(struct pt_regs *regs);
78875 +
78876 +/* arch/xen/i386/mm/hypervisor.c */
78877 +/*
78878 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
78879 + * be MACHINE addresses.
78880 + */
78881 +
78882 +void xen_pt_switch(unsigned long ptr);
78883 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
78884 +void xen_load_gs(unsigned int selector); /* x86_64 only */
78885 +void xen_tlb_flush(void);
78886 +void xen_invlpg(unsigned long ptr);
78887 +
78888 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
78889 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
78890 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
78891 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
78892 +void xen_pgd_pin(unsigned long ptr);
78893 +void xen_pgd_unpin(unsigned long ptr);
78894 +
78895 +void xen_set_ldt(unsigned long ptr, unsigned long bytes);
78896 +
78897 +#ifdef CONFIG_SMP
78898 +#include <linux/cpumask.h>
78899 +void xen_tlb_flush_all(void);
78900 +void xen_invlpg_all(unsigned long ptr);
78901 +void xen_tlb_flush_mask(cpumask_t *mask);
78902 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
78903 +#endif
78904 +
78905 +/* Returns zero on success else negative errno. */
78906 +int xen_create_contiguous_region(
78907 +    unsigned long vstart, unsigned int order, unsigned int address_bits);
78908 +void xen_destroy_contiguous_region(
78909 +    unsigned long vstart, unsigned int order);
78910 +
78911 +/* Turn jiffies into Xen system time. */
78912 +u64 jiffies_to_st(unsigned long jiffies);
78913 +
78914 +#include <asm/hypercall.h>
78915 +
78916 +#if defined(CONFIG_X86_64)
78917 +#define MULTI_UVMFLAGS_INDEX 2
78918 +#define MULTI_UVMDOMID_INDEX 3
78919 +#else
78920 +#define MULTI_UVMFLAGS_INDEX 3
78921 +#define MULTI_UVMDOMID_INDEX 4
78922 +#endif
78923 +
78924 +#define is_running_on_xen() 1
78925 +
78926 +static inline int
78927 +HYPERVISOR_yield(
78928 +       void)
78929 +{
78930 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
78931 +
78932 +#ifdef CONFIG_XEN_COMPAT_030002
78933 +       if (rc == -ENOSYS)
78934 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
78935 +#endif
78936 +
78937 +       return rc;
78938 +}
78939 +
78940 +static inline int
78941 +HYPERVISOR_block(
78942 +       void)
78943 +{
78944 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
78945 +
78946 +#ifdef CONFIG_XEN_COMPAT_030002
78947 +       if (rc == -ENOSYS)
78948 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
78949 +#endif
78950 +
78951 +       return rc;
78952 +}
78953 +
78954 +static inline int
78955 +HYPERVISOR_shutdown(
78956 +       unsigned int reason)
78957 +{
78958 +       struct sched_shutdown sched_shutdown = {
78959 +               .reason = reason
78960 +       };
78961 +
78962 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
78963 +
78964 +#ifdef CONFIG_XEN_COMPAT_030002
78965 +       if (rc == -ENOSYS)
78966 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
78967 +#endif
78968 +
78969 +       return rc;
78970 +}
78971 +
78972 +static inline int
78973 +HYPERVISOR_poll(
78974 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
78975 +{
78976 +       int rc;
78977 +       struct sched_poll sched_poll = {
78978 +               .nr_ports = nr_ports,
78979 +               .timeout = jiffies_to_st(timeout)
78980 +       };
78981 +       set_xen_guest_handle(sched_poll.ports, ports);
78982 +
78983 +       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
78984 +#ifdef CONFIG_XEN_COMPAT_030002
78985 +       if (rc == -ENOSYS)
78986 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
78987 +#endif
78988 +
78989 +       return rc;
78990 +}
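A minimal usage sketch for the poll wrapper (illustrative only; 'my_port' stands for an already-bound event channel, and the timeout is given in jiffies, converted via jiffies_to_st() above):

    evtchn_port_t port = my_port;              /* hypothetical bound port */
    int rc = HYPERVISOR_poll(&port, 1, 1);     /* block for up to one jiffy */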
78991 +
78992 +static inline void
78993 +MULTI_update_va_mapping(
78994 +    multicall_entry_t *mcl, unsigned long va,
78995 +    pte_t new_val, unsigned long flags)
78996 +{
78997 +    mcl->op = __HYPERVISOR_update_va_mapping;
78998 +    mcl->args[0] = va;
78999 +#if defined(CONFIG_X86_64)
79000 +    mcl->args[1] = new_val.pte;
79001 +#elif defined(CONFIG_X86_PAE)
79002 +    mcl->args[1] = new_val.pte_low;
79003 +    mcl->args[2] = new_val.pte_high;
79004 +#else
79005 +    mcl->args[1] = new_val.pte_low;
79006 +    mcl->args[2] = 0;
79007 +#endif
79008 +    mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
79009 +}
79010 +
79011 +static inline void
79012 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
79013 +                    void *uop, unsigned int count)
79014 +{
79015 +    mcl->op = __HYPERVISOR_grant_table_op;
79016 +    mcl->args[0] = cmd;
79017 +    mcl->args[1] = (unsigned long)uop;
79018 +    mcl->args[2] = count;
79019 +}
79020 +
79021 +static inline void
79022 +MULTI_update_va_mapping_otherdomain(
79023 +    multicall_entry_t *mcl, unsigned long va,
79024 +    pte_t new_val, unsigned long flags, domid_t domid)
79025 +{
79026 +    mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
79027 +    mcl->args[0] = va;
79028 +#if defined(CONFIG_X86_64)
79029 +    mcl->args[1] = new_val.pte;
79030 +#elif defined(CONFIG_X86_PAE)
79031 +    mcl->args[1] = new_val.pte_low;
79032 +    mcl->args[2] = new_val.pte_high;
79033 +#else
79034 +    mcl->args[1] = new_val.pte_low;
79035 +    mcl->args[2] = 0;
79036 +#endif
79037 +    mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
79038 +    mcl->args[MULTI_UVMDOMID_INDEX] = domid;
79039 +}
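The MULTI_* helpers above exist so callers can batch several page-table operations into a single HYPERVISOR_multicall(); a minimal sketch (va0/va1 and pte0/pte1 are hypothetical values prepared by the caller, UVMF_INVLPG comes from xen/interface/xen.h):

    multicall_entry_t mcl[2];

    MULTI_update_va_mapping(&mcl[0], va0, pte0, UVMF_INVLPG);
    MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG);
    if (unlikely(HYPERVISOR_multicall(mcl, 2)))
            BUG();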
79040 +
79041 +#endif /* __HYPERVISOR_H__ */
79042 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/io.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/io.h
79043 --- linux-2.6.19/include/asm-i386/mach-xen/asm/io.h     1970-01-01 00:00:00.000000000 +0000
79044 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/io.h   2007-02-02 19:10:55.000000000 +0000
79045 @@ -0,0 +1,363 @@
79046 +#ifndef _ASM_IO_H
79047 +#define _ASM_IO_H
79048 +
79049 +#include <linux/string.h>
79050 +#include <linux/compiler.h>
79051 +
79052 +/*
79053 + * This file contains the definitions for the x86 IO instructions
79054 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
79055 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
79056 + * versions of the single-IO instructions (inb_p/inw_p/..).
79057 + *
79058 + * This file is not meant to be obfuscating: it's just complicated
79059 + * to (a) handle it all in a way that makes gcc able to optimize it
79060 + * as well as possible and (b) trying to avoid writing the same thing
79061 + * over and over again with slight variations and possibly making a
79062 + * mistake somewhere.
79063 + */
79064 +
79065 +/*
79066 + * Thanks to James van Artsdalen for a better timing-fix than
79067 + * the two short jumps: using outb's to a nonexistent port seems
79068 + * to guarantee better timings even on fast machines.
79069 + *
79070 + * On the other hand, I'd like to be sure of a non-existent port:
79071 + * I feel a bit unsafe about using 0x80 (should be safe, though)
79072 + *
79073 + *             Linus
79074 + */
79075 +
79076 + /*
79077 +  *  Bit simplified and optimized by Jan Hubicka
79078 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
79079 +  *
79080 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
79081 +  *  isa_read[wl] and isa_write[wl] fixed
79082 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
79083 +  */
79084 +
79085 +#define IO_SPACE_LIMIT 0xffff
79086 +
79087 +#define XQUAD_PORTIO_BASE 0xfe400000
79088 +#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
79089 +
79090 +#ifdef __KERNEL__
79091 +
79092 +#include <asm-generic/iomap.h>
79093 +
79094 +#include <linux/vmalloc.h>
79095 +#include <asm/fixmap.h>
79096 +
79097 +/*
79098 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
79099 + * access
79100 + */
79101 +#define xlate_dev_mem_ptr(p, sz)       ioremap(p, sz)
79102 +#define xlate_dev_mem_ptr_unmap(p)     iounmap(p)
79103 +
79104 +/*
79105 + * Convert a virtual cached pointer to an uncached pointer
79106 + */
79107 +#define xlate_dev_kmem_ptr(p)  p
79108 +
79109 +/**
79110 + *     virt_to_phys    -       map virtual addresses to physical
79111 + *     @address: address to remap
79112 + *
79113 + *     The returned physical address is the physical (CPU) mapping for
79114 + *     the memory address given. It is only valid to use this function on
79115 + *     addresses directly mapped or allocated via kmalloc. 
79116 + *
79117 + *     This function does not give bus mappings for DMA transfers. In
79118 + *     almost all conceivable cases a device driver should not be using
79119 + *     this function
79120 + */
79121 + */
79122 +static inline unsigned long virt_to_phys(volatile void * address)
79123 +{
79124 +       return __pa(address);
79125 +}
79126 +
79127 +/**
79128 + *     phys_to_virt    -       map physical address to virtual
79129 + *     @address: address to remap
79130 + *
79131 + *     The returned virtual address is a current CPU mapping for
79132 + *     the memory address given. It is only valid to use this function on
79133 + *     addresses that have a kernel mapping
79134 + *
79135 + *     This function does not handle bus mappings for DMA transfers. In
79136 + *     almost all conceivable cases a device driver should not be using
79137 + *     this function
79138 + */
79139 +
79140 +static inline void * phys_to_virt(unsigned long address)
79141 +{
79142 +       return __va(address);
79143 +}
79144 +
79145 +/*
79146 + * Change "struct page" to physical address.
79147 + */
79148 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
79149 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
79150 +#define page_to_bus(page)       (phys_to_machine(page_to_pseudophys(page)))
79151 +
79152 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
79153 +                                 (unsigned long) bio_offset((bio)))
79154 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
79155 +                                 (unsigned long) (bv)->bv_offset)
79156 +
79157 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
79158 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
79159 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
79160 +         bvec_to_pseudophys((vec2))))
79161 +
79162 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
79163 +
79164 +/**
79165 + * ioremap     -   map bus memory into CPU space
79166 + * @offset:    bus address of the memory
79167 + * @size:      size of the resource to map
79168 + *
79169 + * ioremap performs a platform specific sequence of operations to
79170 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
79171 + * writew/writel functions and the other mmio helpers. The returned
79172 + * address is not guaranteed to be usable directly as a virtual
79173 + * address. 
79174 + */
79175 +
79176 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
79177 +{
79178 +       return __ioremap(offset, size, 0);
79179 +}
79180 +
79181 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
79182 +extern void iounmap(volatile void __iomem *addr);
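A minimal MMIO usage sketch against the accessors declared in this header (bar_phys and the register offset are hypothetical):

    void __iomem *regs = ioremap_nocache(bar_phys, PAGE_SIZE);

    if (regs) {
            writel(0x1, regs + 0x10);   /* kick a hypothetical device register */
            (void)readl(regs + 0x10);   /* read back to post the write */
            iounmap(regs);
    }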
79183 +
79184 +/*
79185 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
79186 + * mappings, before the real ioremap() is functional.
79187 + * A boot-time mapping is currently limited to at most 16 pages.
79188 + */
79189 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
79190 +extern void bt_iounmap(void *addr, unsigned long size);
79191 +
79192 +/* Use early IO mappings for DMI because it's initialized early */
79193 +#define dmi_ioremap bt_ioremap
79194 +#define dmi_iounmap bt_iounmap
79195 +#define dmi_alloc alloc_bootmem
79196 +
79197 +/*
79198 + * ISA I/O bus memory addresses are 1:1 with the physical address.
79199 + */
79200 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
79201 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
79202 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
79203 +
79204 +/*
79205 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
79206 + * are forbidden in portable PCI drivers.
79207 + *
79208 + * Allow them on x86 for legacy drivers, though.
79209 + */
79210 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
79211 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
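Under Xen this distinction matters more than on bare metal: virt_to_phys() yields a guest pseudo-physical address, while virt_to_bus() goes through phys_to_machine() to obtain the machine address a device actually needs. A minimal sketch (maddr_t comes from the maddr.h header added later in this patch):

    static char buf[64];                       /* directly mapped kernel memory */
    unsigned long pseudo = virt_to_phys(buf);  /* guest pseudo-physical address */
    maddr_t mach = virt_to_bus(buf);           /* machine address, for legacy DMA */
    /* The two generally differ; only 'mach' is meaningful to hardware. */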
79212 +
79213 +/*
79214 + * readX/writeX() are used to access memory mapped devices. On some
79215 + * architectures the memory mapped IO stuff needs to be accessed
79216 + * differently. On the x86 architecture, we just read/write the
79217 + * memory location directly.
79218 + */
79219 +
79220 +static inline unsigned char readb(const volatile void __iomem *addr)
79221 +{
79222 +       return *(volatile unsigned char __force *) addr;
79223 +}
79224 +static inline unsigned short readw(const volatile void __iomem *addr)
79225 +{
79226 +       return *(volatile unsigned short __force *) addr;
79227 +}
79228 +static inline unsigned int readl(const volatile void __iomem *addr)
79229 +{
79230 +       return *(volatile unsigned int __force *) addr;
79231 +}
79232 +#define readb_relaxed(addr) readb(addr)
79233 +#define readw_relaxed(addr) readw(addr)
79234 +#define readl_relaxed(addr) readl(addr)
79235 +#define __raw_readb readb
79236 +#define __raw_readw readw
79237 +#define __raw_readl readl
79238 +
79239 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
79240 +{
79241 +       *(volatile unsigned char __force *) addr = b;
79242 +}
79243 +static inline void writew(unsigned short b, volatile void __iomem *addr)
79244 +{
79245 +       *(volatile unsigned short __force *) addr = b;
79246 +}
79247 +static inline void writel(unsigned int b, volatile void __iomem *addr)
79248 +{
79249 +       *(volatile unsigned int __force *) addr = b;
79250 +}
79251 +#define __raw_writeb writeb
79252 +#define __raw_writew writew
79253 +#define __raw_writel writel
79254 +
79255 +#define mmiowb()
79256 +
79257 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
79258 +{
79259 +       memset((void __force *) addr, val, count);
79260 +}
79261 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
79262 +{
79263 +       __memcpy(dst, (void __force *) src, count);
79264 +}
79265 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
79266 +{
79267 +       __memcpy((void __force *) dst, src, count);
79268 +}
79269 +
79270 +/*
79271 + * ISA space is 'always mapped' on a typical x86 system, no need to
79272 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
79273 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
79274 + * are physical addresses. The following constant pointer can be
79275 + * used as the IO-area pointer (it can be iounmapped as well, so the
79276 + * analogy with PCI is quite large):
79277 + */
79278 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
79279 +
79280 +/*
79281 + * Again, i386 does not require mem IO specific function.
79282 + */
79283 +
79284 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void __force *)(b),(c),(d))
79285 +
79286 +/*
79287 + *     Cache management
79288 + *
79289 + *     This needed for two cases
79290 + *     1. Out of order aware processors
79291 + *     2. Accidentally out of order processors (PPro errata #51)
79292 + */
79293 + */
79294 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
79295 +
79296 +static inline void flush_write_buffers(void)
79297 +{
79298 +       __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
79299 +}
79300 +
79301 +#define dma_cache_inv(_start,_size)            flush_write_buffers()
79302 +#define dma_cache_wback(_start,_size)          flush_write_buffers()
79303 +#define dma_cache_wback_inv(_start,_size)      flush_write_buffers()
79304 +
79305 +#else
79306 +
79307 +/* Nothing to do */
79308 +
79309 +#define dma_cache_inv(_start,_size)            do { } while (0)
79310 +#define dma_cache_wback(_start,_size)          do { } while (0)
79311 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
79312 +#define flush_write_buffers()
79313 +
79314 +#endif
79315 +
79316 +#endif /* __KERNEL__ */
79317 +
79318 +#ifdef SLOW_IO_BY_JUMPING
79319 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
79320 +#else
79321 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
79322 +#endif
79323 +
79324 +static inline void slow_down_io(void) {
79325 +       __asm__ __volatile__(
79326 +               __SLOW_DOWN_IO
79327 +#ifdef REALLY_SLOW_IO
79328 +               __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
79329 +#endif
79330 +               : : );
79331 +}
79332 +
79333 +#ifdef CONFIG_X86_NUMAQ
79334 +extern void *xquad_portio;    /* Where the IO area was mapped */
79335 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
79336 +#define __BUILDIO(bwl,bw,type) \
79337 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
79338 +       if (xquad_portio) \
79339 +               write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
79340 +       else \
79341 +               out##bwl##_local(value, port); \
79342 +} \
79343 +static inline void out##bwl(unsigned type value, int port) { \
79344 +       out##bwl##_quad(value, port, 0); \
79345 +} \
79346 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
79347 +       if (xquad_portio) \
79348 +               return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
79349 +       else \
79350 +               return in##bwl##_local(port); \
79351 +} \
79352 +static inline unsigned type in##bwl(int port) { \
79353 +       return in##bwl##_quad(port, 0); \
79354 +}
79355 +#else
79356 +#define __BUILDIO(bwl,bw,type) \
79357 +static inline void out##bwl(unsigned type value, int port) { \
79358 +       out##bwl##_local(value, port); \
79359 +} \
79360 +static inline unsigned type in##bwl(int port) { \
79361 +       return in##bwl##_local(port); \
79362 +}
79363 +#endif
79364 +
79365 +
79366 +#define BUILDIO(bwl,bw,type) \
79367 +static inline void out##bwl##_local(unsigned type value, int port) { \
79368 +       __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
79369 +} \
79370 +static inline unsigned type in##bwl##_local(int port) { \
79371 +       unsigned type value; \
79372 +       __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
79373 +       return value; \
79374 +} \
79375 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
79376 +       out##bwl##_local(value, port); \
79377 +       slow_down_io(); \
79378 +} \
79379 +static inline unsigned type in##bwl##_local_p(int port) { \
79380 +       unsigned type value = in##bwl##_local(port); \
79381 +       slow_down_io(); \
79382 +       return value; \
79383 +} \
79384 +__BUILDIO(bwl,bw,type) \
79385 +static inline void out##bwl##_p(unsigned type value, int port) { \
79386 +       out##bwl(value, port); \
79387 +       slow_down_io(); \
79388 +} \
79389 +static inline unsigned type in##bwl##_p(int port) { \
79390 +       unsigned type value = in##bwl(port); \
79391 +       slow_down_io(); \
79392 +       return value; \
79393 +} \
79394 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
79395 +       __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
79396 +} \
79397 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
79398 +       __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
79399 +}
79400 +
79401 +BUILDIO(b,b,char)
79402 +BUILDIO(w,w,short)
79403 +BUILDIO(l,,int)
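These instantiations generate the whole in/out family (inb/inw/inl, the _p pausing variants, and the string forms). As a small illustration, the CMOS access that the floppy header above wraps in rtc_lock boils down to the classic index/data port pair (illustrative only; real code holds rtc_lock and preserves the NMI-disable bit in port 0x70):

    unsigned char types;

    outb_p(0x10, 0x70);                 /* select CMOS register 0x10 */
    types = inb_p(0x71);                /* floppy drive types, two nibbles */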
79404 +
79405 +/* We will be supplying our own /dev/mem implementation */
79406 +#define ARCH_HAS_DEV_MEM
79407 +
79408 +#endif
79409 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/irqflags.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/irqflags.h
79410 --- linux-2.6.19/include/asm-i386/mach-xen/asm/irqflags.h       1970-01-01 00:00:00.000000000 +0000
79411 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/irqflags.h     2007-02-02 19:10:55.000000000 +0000
79412 @@ -0,0 +1,81 @@
79413 +/*
79414 + * include/asm-i386/irqflags.h
79415 + *
79416 + * IRQ flags handling
79417 + *
79418 + * This file gets included from lowlevel asm headers too, to provide
79419 + * wrapped versions of the local_irq_*() APIs, based on the
79420 + * raw_local_irq_*() functions from the lowlevel headers.
79421 + */
79422 +#ifndef _ASM_IRQFLAGS_H
79423 +#define _ASM_IRQFLAGS_H
79424 +
79425 +#ifndef __ASSEMBLY__
79426 +
79427 +unsigned long __raw_local_save_flags(void);
79428 +
79429 +#define raw_local_save_flags(flags) \
79430 +               do { (flags) = __raw_local_save_flags(); } while (0)
79431 +
79432 +void raw_local_irq_restore(unsigned long flags);
79433 +void raw_local_irq_disable(void);
79434 +void raw_local_irq_enable(void);
79435 +
79436 +/*
79437 + * Used in the idle loop; sti takes one instruction cycle
79438 + * to complete:
79439 + */
79440 +void raw_safe_halt(void);
79441 +
79442 +/*
79443 + * Used when interrupts are already enabled or to
79444 + * shutdown the processor:
79445 + */
79446 +void halt(void);
79447 +
79448 +static inline int raw_irqs_disabled_flags(unsigned long flags)
79449 +{
79450 +       return flags != 0;
79451 +}
79452 +
79453 +int raw_irqs_disabled(void);
79454 +
79455 +/*
79456 + * For spinlocks, etc:
79457 + */
79458 +unsigned long __raw_local_irq_save(void);
79459 +#define raw_local_irq_save(flags) \
79460 +               do { (flags) = __raw_local_irq_save(); } while (0)
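Unlike native i386, these are out-of-line functions; given that raw_irqs_disabled_flags() above treats any non-zero value as "disabled", the saved flags are presumably the guest's event-delivery mask rather than EFLAGS.IF. The usage pattern is the usual bracket:

    unsigned long flags;

    raw_local_irq_save(flags);     /* mask event delivery, remember prior state */
    /* ... critical section shared with the event-channel upcall ... */
    raw_local_irq_restore(flags);  /* re-enable only if it was enabled before */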
79461 +
79462 +#endif /* __ASSEMBLY__ */
79463 +
79464 +/*
79465 + * Do the CPU's IRQ-state tracing from assembly code. We call a
79466 + * C function, so save all the C-clobbered registers:
79467 + */
79468 +#ifdef CONFIG_TRACE_IRQFLAGS
79469 +
79470 +# define TRACE_IRQS_ON                         \
79471 +       pushl %eax;                             \
79472 +       pushl %ecx;                             \
79473 +       pushl %edx;                             \
79474 +       call trace_hardirqs_on;                 \
79475 +       popl %edx;                              \
79476 +       popl %ecx;                              \
79477 +       popl %eax;
79478 +
79479 +# define TRACE_IRQS_OFF                                \
79480 +       pushl %eax;                             \
79481 +       pushl %ecx;                             \
79482 +       pushl %edx;                             \
79483 +       call trace_hardirqs_off;                \
79484 +       popl %edx;                              \
79485 +       popl %ecx;                              \
79486 +       popl %eax;
79487 +
79488 +#else
79489 +# define TRACE_IRQS_ON
79490 +# define TRACE_IRQS_OFF
79491 +#endif
79492 +
79493 +#endif
79494 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/kmap_types.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/kmap_types.h
79495 --- linux-2.6.19/include/asm-i386/mach-xen/asm/kmap_types.h     1970-01-01 00:00:00.000000000 +0000
79496 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/kmap_types.h   2007-02-02 19:10:55.000000000 +0000
79497 @@ -0,0 +1,31 @@
79498 +#ifndef _ASM_KMAP_TYPES_H
79499 +#define _ASM_KMAP_TYPES_H
79500 +
79501 +
79502 +#ifdef CONFIG_DEBUG_HIGHMEM
79503 +# define D(n) __KM_FENCE_##n ,
79504 +#else
79505 +# define D(n)
79506 +#endif
79507 +
79508 +enum km_type {
79509 +D(0)   KM_BOUNCE_READ,
79510 +D(1)   KM_SKB_SUNRPC_DATA,
79511 +D(2)   KM_SKB_DATA_SOFTIRQ,
79512 +D(3)   KM_USER0,
79513 +D(4)   KM_USER1,
79514 +D(5)   KM_BIO_SRC_IRQ,
79515 +D(6)   KM_BIO_DST_IRQ,
79516 +D(7)   KM_PTE0,
79517 +D(8)   KM_PTE1,
79518 +D(9)   KM_IRQ0,
79519 +D(10)  KM_IRQ1,
79520 +D(11)  KM_SOFTIRQ0,
79521 +D(12)  KM_SOFTIRQ1,
79522 +D(13)  KM_SWIOTLB,
79523 +D(14)  KM_TYPE_NR
79524 +};
79525 +
79526 +#undef D
79527 +
79528 +#endif
79529 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/maddr.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/maddr.h
79530 --- linux-2.6.19/include/asm-i386/mach-xen/asm/maddr.h  1970-01-01 00:00:00.000000000 +0000
79531 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/maddr.h        2007-02-02 19:10:55.000000000 +0000
79532 @@ -0,0 +1,176 @@
79533 +#ifndef _I386_MADDR_H
79534 +#define _I386_MADDR_H
79535 +
79536 +#include <xen/features.h>
79537 +#include <xen/interface/xen.h>
79538 +
79539 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
79540 +#define INVALID_P2M_ENTRY      (~0UL)
79541 +#define FOREIGN_FRAME_BIT      (1UL<<31)
79542 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
79543 +
79544 +/* Definitions for machine and pseudophysical addresses. */
79545 +#ifdef CONFIG_X86_PAE
79546 +typedef unsigned long long paddr_t;
79547 +typedef unsigned long long maddr_t;
79548 +#else
79549 +typedef unsigned long paddr_t;
79550 +typedef unsigned long maddr_t;
79551 +#endif
79552 +
79553 +#ifdef CONFIG_XEN
79554 +
79555 +extern unsigned long *phys_to_machine_mapping;
79556 +
79557 +#undef machine_to_phys_mapping
79558 +extern unsigned long *machine_to_phys_mapping;
79559 +extern unsigned int   machine_to_phys_order;
79560 +
79561 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
79562 +{
79563 +       if (xen_feature(XENFEAT_auto_translated_physmap))
79564 +               return pfn;
79565 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
79566 +               ~FOREIGN_FRAME_BIT;
79567 +}
79568 +
79569 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
79570 +{
79571 +       if (xen_feature(XENFEAT_auto_translated_physmap))
79572 +               return 1;
79573 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
79574 +}
79575 +
79576 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
79577 +{
79578 +       extern unsigned long max_mapnr;
79579 +       unsigned long pfn;
79580 +
79581 +       if (xen_feature(XENFEAT_auto_translated_physmap))
79582 +               return mfn;
79583 +
79584 +       if (unlikely((mfn >> machine_to_phys_order) != 0))
79585 +               return max_mapnr;
79586 +
79587 +       /* The array access can fail (e.g., device space beyond end of RAM). */
79588 +       asm (
79589 +               "1:     movl %1,%0\n"
79590 +               "2:\n"
79591 +               ".section .fixup,\"ax\"\n"
79592 +               "3:     movl %2,%0\n"
79593 +               "       jmp  2b\n"
79594 +               ".previous\n"
79595 +               ".section __ex_table,\"a\"\n"
79596 +               "       .align 4\n"
79597 +               "       .long 1b,3b\n"
79598 +               ".previous"
79599 +               : "=r" (pfn)
79600 +               : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
79601 +
79602 +       return pfn;
79603 +}
79604 +
79605 +/*
79606 + * We detect special mappings in one of two ways:
79607 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
79608 + *     to be outside our maximum possible pseudophys range.
79609 + *  2. If the MFN belongs to a different domain then we will certainly
79610 + *     not have MFN in our p2m table. Conversely, if the page is ours,
79611 + *     then we'll have p2m(m2p(MFN))==MFN.
79612 + * If we detect a special mapping then it doesn't have a 'struct page'.
79613 + * We force !pfn_valid() by returning an out-of-range pointer.
79614 + *
79615 + * NB. These checks require that, for any MFN that is not in our reservation,
79616 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
79617 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
79618 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
79619 + *
79620 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
79621 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
79622 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
79623 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
79624 + */
79625 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
79626 +{
79627 +       extern unsigned long max_mapnr;
79628 +       unsigned long pfn = mfn_to_pfn(mfn);
79629 +       if ((pfn < max_mapnr)
79630 +           && !xen_feature(XENFEAT_auto_translated_physmap)
79631 +           && (phys_to_machine_mapping[pfn] != mfn))
79632 +               return max_mapnr; /* force !pfn_valid() */
79633 +       return pfn;
79634 +}
79635 +
79636 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
79637 +{
79638 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
79639 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
79640 +               return;
79641 +       }
79642 +       phys_to_machine_mapping[pfn] = mfn;
79643 +}
79644 +
79645 +static inline maddr_t phys_to_machine(paddr_t phys)
79646 +{
79647 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
79648 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
79649 +       return machine;
79650 +}
79651 +
79652 +static inline paddr_t machine_to_phys(maddr_t machine)
79653 +{
79654 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
79655 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
79656 +       return phys;
79657 +}
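Both conversions above swap the frame number through the p2m/m2p tables while keeping the offset within the page, so for an ordinary local frame the round trip is the identity. A minimal sketch (the pfn and offset are hypothetical):

    unsigned long pfn = 0x1234;                     /* hypothetical guest frame */
    paddr_t pseudo = ((paddr_t)pfn << PAGE_SHIFT) | 0x10;
    maddr_t mach = phys_to_machine(pseudo);         /* p2m lookup, offset kept */
    BUG_ON(machine_to_phys(mach) != pseudo);        /* m2p undoes it */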
79658 +
79659 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
79660 +{
79661 +       /*
79662 +        * In PAE mode, the NX bit needs to be dealt with in the value
79663 +        * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
79664 +        * but for i386 the conversion to ulong for the argument will
79665 +        * clip it off.
79666 +        */
79667 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
79668 +       phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
79669 +       return phys;
79670 +}
79671 +
79672 +#else /* !CONFIG_XEN */
79673 +
79674 +#define pfn_to_mfn(pfn) (pfn)
79675 +#define mfn_to_pfn(mfn) (mfn)
79676 +#define mfn_to_local_pfn(mfn) (mfn)
79677 +#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn))
79678 +#define phys_to_machine_mapping_valid(pfn) (1)
79679 +#define phys_to_machine(phys) ((maddr_t)(phys))
79680 +#define machine_to_phys(mach) ((paddr_t)(mach))
79681 +#define pte_machine_to_phys(mach) ((paddr_t)(mach))
79682 +
79683 +#endif /* !CONFIG_XEN */
79684 +
79685 +/* VIRT <-> MACHINE conversion */
79686 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
79687 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
79688 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
79689 +
79690 +#ifdef CONFIG_X86_PAE
79691 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
79692 +{
79693 +       pte_t pte;
79694 +
79695 +       pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
79696 +                                       (pgprot_val(pgprot) >> 32);
79697 +       pte.pte_high &= (__supported_pte_mask >> 32);
79698 +       pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
79699 +                                                       __supported_pte_mask;
79700 +       return pte;
79701 +}
79702 +#else
79703 +#define pfn_pte_ma(pfn, prot)  __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
79704 +#endif
79705 +
79706 +#define __pte_ma(x)    ((pte_t) { (x) } )
79707 +
79708 +#endif /* _I386_MADDR_H */
79709 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/mmu.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/mmu.h
79710 --- linux-2.6.19/include/asm-i386/mach-xen/asm/mmu.h    1970-01-01 00:00:00.000000000 +0000
79711 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/mmu.h  2007-02-02 19:10:55.000000000 +0000
79712 @@ -0,0 +1,29 @@
79713 +#ifndef __i386_MMU_H
79714 +#define __i386_MMU_H
79715 +
79716 +#include <asm/semaphore.h>
79717 +/*
79718 + * The i386 doesn't have a mmu context, but
79719 + * we put the segment information here.
79720 + *
79721 + * cpu_vm_mask is used to optimize ldt flushing.
79722 + */
79723 +typedef struct { 
79724 +       int size;
79725 +       struct semaphore sem;
79726 +       void *ldt;
79727 +       void *vdso;
79728 +#ifdef CONFIG_XEN
79729 +       int has_foreign_mappings;
79730 +#endif
79731 +} mm_context_t;
79732 +
79733 +/* mm/memory.c:exit_mmap hook */
79734 +extern void _arch_exit_mmap(struct mm_struct *mm);
79735 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
79736 +
79737 +/* kernel/fork.c:dup_mmap hook */
79738 +extern void _arch_dup_mmap(struct mm_struct *mm);
79739 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
79740 +
79741 +#endif
79742 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/mmu_context.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/mmu_context.h
79743 --- linux-2.6.19/include/asm-i386/mach-xen/asm/mmu_context.h    1970-01-01 00:00:00.000000000 +0000
79744 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/mmu_context.h  2007-02-02 19:10:55.000000000 +0000
79745 @@ -0,0 +1,108 @@
79746 +#ifndef __I386_SCHED_H
79747 +#define __I386_SCHED_H
79748 +
79749 +#include <asm/desc.h>
79750 +#include <asm/atomic.h>
79751 +#include <asm/pgalloc.h>
79752 +#include <asm/tlbflush.h>
79753 +
79754 +/*
79755 + * Used for LDT copy/destruction.
79756 + */
79757 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
79758 +void destroy_context(struct mm_struct *mm);
79759 +
79760 +
79761 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
79762 +{
79763 +#if 0 /* XEN: no lazy tlb */
79764 +       unsigned cpu = smp_processor_id();
79765 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
79766 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
79767 +#endif
79768 +}
79769 +
79770 +#define prepare_arch_switch(next)      __prepare_arch_switch()
79771 +
79772 +static inline void __prepare_arch_switch(void)
79773 +{
79774 +       /*
79775 +        * Save away %fs and %gs. No need to save %es and %ds, as those
79776 +        * are always kernel segments while inside the kernel. Must
79777 +        * happen before reload of cr3/ldt (i.e., not in __switch_to).
79778 +        */
79779 +       asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
79780 +               : "=m" (current->thread.fs),
79781 +                 "=m" (current->thread.gs));
79782 +       asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
79783 +               : : "r" (0) );
79784 +}
79785 +
79786 +extern void mm_pin(struct mm_struct *mm);
79787 +extern void mm_unpin(struct mm_struct *mm);
79788 +void mm_pin_all(void);
79789 +
79790 +static inline void switch_mm(struct mm_struct *prev,
79791 +                            struct mm_struct *next,
79792 +                            struct task_struct *tsk)
79793 +{
79794 +       int cpu = smp_processor_id();
79795 +       struct mmuext_op _op[2], *op = _op;
79796 +
79797 +       if (likely(prev != next)) {
79798 +               BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
79799 +                      !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
79800 +
79801 +               /* stop flush ipis for the previous mm */
79802 +               cpu_clear(cpu, prev->cpu_vm_mask);
79803 +#if 0 /* XEN: no lazy tlb */
79804 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
79805 +               per_cpu(cpu_tlbstate, cpu).active_mm = next;
79806 +#endif
79807 +               cpu_set(cpu, next->cpu_vm_mask);
79808 +
79809 +               /* Re-load page tables: load_cr3(next->pgd) */
79810 +               op->cmd = MMUEXT_NEW_BASEPTR;
79811 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
79812 +               op++;
79813 +
79814 +               /*
79815 +                * load the LDT, if the LDT is different:
79816 +                */
79817 +               if (unlikely(prev->context.ldt != next->context.ldt)) {
79818 +                       /* load_LDT_nolock(&next->context, cpu) */
79819 +                       op->cmd = MMUEXT_SET_LDT;
79820 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
79821 +                       op->arg2.nr_ents     = next->context.size;
79822 +                       op++;
79823 +               }
79824 +
79825 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
79826 +       }
79827 +#if 0 /* XEN: no lazy tlb */
79828 +       else {
79829 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
79830 +               BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
79831 +
79832 +               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
79833 +                       /* We were in lazy tlb mode and leave_mm disabled 
79834 +                        * tlb flush IPI delivery. We must reload %cr3.
79835 +                        */
79836 +                       load_cr3(next->pgd);
79837 +                       load_LDT_nolock(&next->context, cpu);
79838 +               }
79839 +       }
79840 +#endif
79841 +}
79842 +
79843 +#define deactivate_mm(tsk, mm) \
79844 +       asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
79845 +
79846 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
79847 +{
79848 +       if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
79849 +               mm_pin(next);
79850 +       switch_mm(prev, next, NULL);
79851 +}
79852 +
79853 +#endif
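switch_mm() above replaces the native cr3 write and LDT reload with hypervisor requests, and it batches them: MMUEXT_NEW_BASEPTR is always queued, MMUEXT_SET_LDT only when the LDT differs, and both go down in a single HYPERVISOR_mmuext_op call. A stand-alone sketch of that batching pattern, where toy_op and toy_submit() are hypothetical stand-ins for struct mmuext_op and the hypercall:

/* Sketch only: collect the operations in an array, advance a cursor,
 * submit once. */
#include <stdio.h>

struct toy_op { int cmd; unsigned long arg; };
enum { TOY_NEW_BASEPTR = 1, TOY_SET_LDT = 2 };

static int toy_submit(struct toy_op *ops, int count)
{
        for (int i = 0; i < count; i++)
                printf("op %d: cmd=%d arg=%#lx\n", i, ops[i].cmd, ops[i].arg);
        return 0;                       /* the real hypercall returns 0 on success */
}

static void toy_switch_mm(unsigned long next_pgd_mfn,
                          unsigned long next_ldt, int ldt_changed)
{
        struct toy_op batch[2], *op = batch;

        op->cmd = TOY_NEW_BASEPTR;      /* always reload the page-table base */
        op->arg = next_pgd_mfn;
        op++;
        if (ldt_changed) {              /* only queue the LDT switch when needed */
                op->cmd = TOY_SET_LDT;
                op->arg = next_ldt;
                op++;
        }
        toy_submit(batch, (int)(op - batch));   /* one submission for both */
}

int main(void)
{
        toy_switch_mm(0x1234, 0xc0100000UL, 1);
        return 0;
}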
79854 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/page.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/page.h
79855 --- linux-2.6.19/include/asm-i386/mach-xen/asm/page.h   1970-01-01 00:00:00.000000000 +0000
79856 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/page.h 2007-02-02 19:10:55.000000000 +0000
79857 @@ -0,0 +1,231 @@
79858 +#ifndef _I386_PAGE_H
79859 +#define _I386_PAGE_H
79860 +
79861 +/* PAGE_SHIFT determines the page size */
79862 +#define PAGE_SHIFT     12
79863 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
79864 +#define PAGE_MASK      (~(PAGE_SIZE-1))
79865 +
79866 +#ifdef CONFIG_X86_PAE
79867 +#define __PHYSICAL_MASK_SHIFT  36
79868 +#define __PHYSICAL_MASK                ((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
79869 +#define PHYSICAL_PAGE_MASK     (~((1ULL << PAGE_SHIFT) - 1) & __PHYSICAL_MASK)
79870 +#else
79871 +#define __PHYSICAL_MASK_SHIFT  32
79872 +#define __PHYSICAL_MASK                (~0UL)
79873 +#define PHYSICAL_PAGE_MASK     (PAGE_MASK & __PHYSICAL_MASK)
79874 +#endif
79875 +
79876 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
79877 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
79878 +
79879 +#ifdef __KERNEL__
79880 +#ifndef __ASSEMBLY__
79881 +
79882 +#include <linux/string.h>
79883 +#include <linux/types.h>
79884 +#include <linux/kernel.h>
79885 +#include <asm/bug.h>
79886 +#include <xen/interface/xen.h>
79887 +#include <xen/features.h>
79888 +#include <xen/foreign_page.h>
79889 +
79890 +#define arch_free_page(_page,_order)                   \
79891 +({     int foreign = PageForeign(_page);               \
79892 +       if (foreign)                                    \
79893 +               (PageForeignDestructor(_page))(_page);  \
79894 +       foreign;                                        \
79895 +})
79896 +#define HAVE_ARCH_FREE_PAGE
79897 +
79898 +#ifdef CONFIG_XEN_SCRUB_PAGES
79899 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
79900 +#else
79901 +#define scrub_pages(_p,_n) ((void)0)
79902 +#endif
79903 +
79904 +#ifdef CONFIG_X86_USE_3DNOW
79905 +
79906 +#include <asm/mmx.h>
79907 +
79908 +#define clear_page(page)       mmx_clear_page((void *)(page))
79909 +#define copy_page(to,from)     mmx_copy_page(to,from)
79910 +
79911 +#else
79912 +
79913 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
79914 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
79915 +
79916 +/*
79917 + *     On older X86 processors it's not a win to use MMX here it seems.
79918 + *     Maybe the K6-III ?
79919 + */
79920 +
79921 +#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
79922 +#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
79923 +
79924 +#endif
79925 +
79926 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
79927 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
79928 +
79929 +/*
79930 + * These are used to make use of C type-checking..
79931 + */
79932 +extern int nx_enabled;
79933 +#ifdef CONFIG_X86_PAE
79934 +extern unsigned long long __supported_pte_mask;
79935 +typedef struct { unsigned long pte_low, pte_high; } pte_t;
79936 +typedef struct { unsigned long long pmd; } pmd_t;
79937 +typedef struct { unsigned long long pgd; } pgd_t;
79938 +typedef struct { unsigned long long pgprot; } pgprot_t;
79939 +#define pgprot_val(x)  ((x).pgprot)
79940 +#include <asm/maddr.h>
79941 +#define __pte(x) ({ unsigned long long _x = (x);        \
79942 +    if (_x & 1) _x = phys_to_machine(_x);               \
79943 +    ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
79944 +#define __pgd(x) ({ unsigned long long _x = (x); \
79945 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
79946 +#define __pmd(x) ({ unsigned long long _x = (x); \
79947 +    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
79948 +static inline unsigned long long pte_val(pte_t x)
79949 +{
79950 +       unsigned long long ret;
79951 +
79952 +       if (x.pte_low) {
79953 +               ret = x.pte_low | (unsigned long long)x.pte_high << 32;
79954 +               ret = pte_machine_to_phys(ret) | 1;
79955 +       } else {
79956 +               ret = 0;
79957 +       }
79958 +       return ret;
79959 +}
79960 +static inline unsigned long long pmd_val(pmd_t x)
79961 +{
79962 +       unsigned long long ret = x.pmd;
79963 +       if (ret) ret = pte_machine_to_phys(ret) | 1;
79964 +       return ret;
79965 +}
79966 +static inline unsigned long long pgd_val(pgd_t x)
79967 +{
79968 +       unsigned long long ret = x.pgd;
79969 +       if (ret) ret = pte_machine_to_phys(ret) | 1;
79970 +       return ret;
79971 +}
79972 +static inline unsigned long long pte_val_ma(pte_t x)
79973 +{
79974 +       return (unsigned long long)x.pte_high << 32 | x.pte_low;
79975 +}
79976 +#define HPAGE_SHIFT    21
79977 +#else
79978 +typedef struct { unsigned long pte_low; } pte_t;
79979 +typedef struct { unsigned long pgd; } pgd_t;
79980 +typedef struct { unsigned long pgprot; } pgprot_t;
79981 +#define pgprot_val(x)  ((x).pgprot)
79982 +#include <asm/maddr.h>
79983 +#define boot_pte_t pte_t /* or would you rather have a typedef */
79984 +#define pte_val(x)     (((x).pte_low & 1) ? \
79985 +                        pte_machine_to_phys((x).pte_low) : \
79986 +                        (x).pte_low)
79987 +#define pte_val_ma(x)  ((x).pte_low)
79988 +#define __pte(x) ({ unsigned long _x = (x); \
79989 +    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
79990 +#define __pgd(x) ({ unsigned long _x = (x); \
79991 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
79992 +static inline unsigned long pgd_val(pgd_t x)
79993 +{
79994 +       unsigned long ret = x.pgd;
79995 +       if (ret) ret = pte_machine_to_phys(ret) | 1;
79996 +       return ret;
79997 +}
79998 +#define HPAGE_SHIFT    22
79999 +#endif
80000 +#define PTE_MASK       PAGE_MASK
80001 +
80002 +#ifdef CONFIG_HUGETLB_PAGE
80003 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
80004 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
80005 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
80006 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
80007 +#endif
80008 +
80009 +#define __pgprot(x)    ((pgprot_t) { (x) } )
80010 +
80011 +#endif /* !__ASSEMBLY__ */
80012 +
80013 +/* to align the pointer to the (next) page boundary */
80014 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
80015 +
80016 +/*
80017 + * This handles the memory map.. We could make this a config
80018 + * option, but too many people screw it up, and too few need
80019 + * it.
80020 + *
80021 + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
80022 + * a virtual address space of one gigabyte, which limits the
80023 + * amount of physical memory you can use to about 950MB. 
80024 + *
80025 + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
80026 + * and CONFIG_HIGHMEM64G options in the kernel configuration.
80027 + */
80028 +
80029 +#ifndef __ASSEMBLY__
80030 +
80031 +struct vm_area_struct;
80032 +
80033 +/*
80034 + * This much address space is reserved for vmalloc() and iomap()
80035 + * as well as fixmap mappings.
80036 + */
80037 +extern unsigned int __VMALLOC_RESERVE;
80038 +
80039 +extern int sysctl_legacy_va_layout;
80040 +
80041 +extern int page_is_ram(unsigned long pagenr);
80042 +
80043 +#endif /* __ASSEMBLY__ */
80044 +
80045 +#ifdef __ASSEMBLY__
80046 +#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
80047 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
80048 +#else
80049 +#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
80050 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
80051 +#endif
80052 +#define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
80053 +
80054 +#ifdef CONFIG_XEN_COMPAT_030002
80055 +#undef LOAD_OFFSET
80056 +#define LOAD_OFFSET            0
80057 +#endif /* CONFIG_XEN_COMPAT_030002 */
80058 +
80059 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
80060 +#define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
80061 +#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
80062 +#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
80063 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
80064 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
80065 +#ifdef CONFIG_FLATMEM
80066 +#define pfn_valid(pfn)         ((pfn) < max_mapnr)
80067 +#endif /* CONFIG_FLATMEM */
80068 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
80069 +
80070 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
80071 +
80072 +#define VM_DATA_DEFAULT_FLAGS \
80073 +       (VM_READ | VM_WRITE | \
80074 +       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
80075 +                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
80076 +
80077 +/* VIRT <-> MACHINE conversion */
80078 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
80079 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
80080 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
80081 +
80082 +#include <asm-generic/memory_model.h>
80083 +#include <asm-generic/page.h>
80084 +
80085 +#define __HAVE_ARCH_GATE_AREA 1
80086 +#endif /* __KERNEL__ */
80087 +
80088 +#endif /* _I386_PAGE_H */
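Two details of the page.h above are easy to miss: a PAE pte travels as two 32-bit halves (pte_low/pte_high), and the __pte()/pte_val() wrappers convert between pseudo-physical and machine values only when the entry is actually in use (bit 0, the present bit, in __pte(); a non-zero pte_low in pte_val()). A simplified stand-alone sketch, with toy_phys_to_machine()/toy_machine_to_phys() as hypothetical stand-ins for the real translation:

/* Sketch only: an offset of 4GB models the phys<->machine translation. */
#include <stdio.h>
#include <stdint.h>

struct toy_pte { uint32_t pte_low, pte_high; };

static uint64_t toy_phys_to_machine(uint64_t phys) { return phys + 0x100000000ULL; }
static uint64_t toy_machine_to_phys(uint64_t mach) { return mach - 0x100000000ULL; }

static struct toy_pte toy_mk_pte(uint64_t val)
{
        if (val & 1)                            /* present: store the machine value */
                val = toy_phys_to_machine(val);
        return (struct toy_pte){ (uint32_t)val, (uint32_t)(val >> 32) };
}

static uint64_t toy_pte_val(struct toy_pte pte)
{
        uint64_t val = ((uint64_t)pte.pte_high << 32) | pte.pte_low;

        return (val & 1) ? toy_machine_to_phys(val) : val;  /* undo it on read-back */
}

int main(void)
{
        uint64_t phys_entry = 0x12345000ULL | 1;            /* frame plus present bit */
        struct toy_pte pte = toy_mk_pte(phys_entry);

        printf("stored low=%#x high=%#x, read back %#llx\n",
               pte.pte_low, pte.pte_high,
               (unsigned long long)toy_pte_val(pte));
        return 0;
}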
80089 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/pci.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pci.h
80090 --- linux-2.6.19/include/asm-i386/mach-xen/asm/pci.h    1970-01-01 00:00:00.000000000 +0000
80091 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pci.h  2007-02-02 19:10:55.000000000 +0000
80092 @@ -0,0 +1,153 @@
80093 +#ifndef __i386_PCI_H
80094 +#define __i386_PCI_H
80095 +
80096 +
80097 +#ifdef __KERNEL__
80098 +#include <linux/mm.h>          /* for struct page */
80099 +
80100 +/* Can be used to override the logic in pci_scan_bus for skipping
80101 +   already-configured bus numbers - to be used for buggy BIOSes
80102 +   or architectures with incomplete PCI setup by the loader */
80103 +
80104 +#ifdef CONFIG_PCI
80105 +extern unsigned int pcibios_assign_all_busses(void);
80106 +#else
80107 +#define pcibios_assign_all_busses()    0
80108 +#endif
80109 +#define pcibios_scan_all_fns(a, b)     0
80110 +
80111 +extern unsigned long pci_mem_start;
80112 +#define PCIBIOS_MIN_IO         0x1000
80113 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
80114 +
80115 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
80116 +
80117 +void pcibios_config_init(void);
80118 +struct pci_bus * pcibios_scan_root(int bus);
80119 +
80120 +void pcibios_set_master(struct pci_dev *dev);
80121 +void pcibios_penalize_isa_irq(int irq, int active);
80122 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
80123 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
80124 +
80125 +/* Dynamic DMA mapping stuff.
80126 + * i386 has everything mapped statically.
80127 + */
80128 +
80129 +#include <linux/types.h>
80130 +#include <linux/slab.h>
80131 +#include <asm/scatterlist.h>
80132 +#include <linux/string.h>
80133 +#include <asm/io.h>
80134 +
80135 +struct pci_dev;
80136 +
80137 +#ifdef CONFIG_SWIOTLB
80138 +
80139 +
80140 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
80141 +#define PCI_DMA_BUS_IS_PHYS    (0)
80142 +
80143 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
80144 +       dma_addr_t ADDR_NAME;
80145 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
80146 +       __u32 LEN_NAME;
80147 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
80148 +       ((PTR)->ADDR_NAME)
80149 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
80150 +       (((PTR)->ADDR_NAME) = (VAL))
80151 +#define pci_unmap_len(PTR, LEN_NAME)                   \
80152 +       ((PTR)->LEN_NAME)
80153 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
80154 +       (((PTR)->LEN_NAME) = (VAL))
80155 +
80156 +#else
80157 +
80158 +/* The PCI address space does equal the physical memory
80159 + * address space.  The networking and block device layers use
80160 + * this boolean for bounce buffer decisions.
80161 + */
80162 +#define PCI_DMA_BUS_IS_PHYS    (1)
80163 +
80164 +/* pci_unmap_{page,single} is a nop so... */
80165 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
80166 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
80167 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
80168 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
80169 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
80170 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
80171 +
80172 +#endif
80173 +
80174 +/* This is always fine. */
80175 +#define pci_dac_dma_supported(pci_dev, mask)   (1)
80176 +
80177 +static inline dma64_addr_t
80178 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
80179 +{
80180 +       return ((dma64_addr_t) page_to_phys(page) +
80181 +               (dma64_addr_t) offset);
80182 +}
80183 +
80184 +static inline struct page *
80185 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
80186 +{
80187 +       return pfn_to_page(dma_addr >> PAGE_SHIFT);
80188 +}
80189 +
80190 +static inline unsigned long
80191 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
80192 +{
80193 +       return (dma_addr & ~PAGE_MASK);
80194 +}
80195 +
80196 +static inline void
80197 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
80198 +{
80199 +}
80200 +
80201 +static inline void
80202 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
80203 +{
80204 +       flush_write_buffers();
80205 +}
80206 +
80207 +#define HAVE_PCI_MMAP
80208 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
80209 +                              enum pci_mmap_state mmap_state, int write_combine);
80210 +
80211 +
80212 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
80213 +{
80214 +}
80215 +
80216 +#ifdef CONFIG_PCI
80217 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
80218 +                                       enum pci_dma_burst_strategy *strat,
80219 +                                       unsigned long *strategy_parameter)
80220 +{
80221 +       *strat = PCI_DMA_BURST_INFINITY;
80222 +       *strategy_parameter = ~0UL;
80223 +}
80224 +#endif
80225 +
80226 +#endif /* __KERNEL__ */
80227 +
80228 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
80229 +#include <xen/pcifront.h>
80230 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
80231 +
80232 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
80233 +#include <asm-generic/pci-dma-compat.h>
80234 +
80235 +/* generic pci stuff */
80236 +#include <asm-generic/pci.h>
80237 +
80238 +/* On Xen we have to scan all functions since Xen hides bridges from
80239 + * us.  If a bridge is at fn=0 and that slot has a multifunction
80240 + * device, we won't find the additional devices without scanning all
80241 + * functions. */
80242 +#undef pcibios_scan_all_fns
80243 +#define pcibios_scan_all_fns(a, b)     1
80244 +
80245 +#endif /* __i386_PCI_H */
80246 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/pgalloc.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgalloc.h
80247 --- linux-2.6.19/include/asm-i386/mach-xen/asm/pgalloc.h        1970-01-01 00:00:00.000000000 +0000
80248 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgalloc.h      2007-02-02 19:10:55.000000000 +0000
80249 @@ -0,0 +1,63 @@
80250 +#ifndef _I386_PGALLOC_H
80251 +#define _I386_PGALLOC_H
80252 +
80253 +#include <asm/fixmap.h>
80254 +#include <linux/threads.h>
80255 +#include <linux/mm.h>          /* for struct page */
80256 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
80257 +
80258 +/* Is this pagetable pinned? */
80259 +#define PG_pinned      PG_arch_1
80260 +
80261 +#define pmd_populate_kernel(mm, pmd, pte) \
80262 +               set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
80263 +
80264 +#define pmd_populate(mm, pmd, pte)                                     \
80265 +do {                                                                   \
80266 +       if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {     \
80267 +               if (!PageHighMem(pte))                                  \
80268 +                       BUG_ON(HYPERVISOR_update_va_mapping(            \
80269 +                         (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
80270 +                         pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\
80271 +               set_pmd(pmd, __pmd(_PAGE_TABLE +                        \
80272 +                       ((unsigned long long)page_to_pfn(pte) <<        \
80273 +                               (unsigned long long) PAGE_SHIFT)));     \
80274 +       } else {                                                        \
80275 +               *(pmd) = __pmd(_PAGE_TABLE +                            \
80276 +                       ((unsigned long long)page_to_pfn(pte) <<        \
80277 +                               (unsigned long long) PAGE_SHIFT));      \
80278 +       }                                                               \
80279 +} while (0)
80280 +
80281 +/*
80282 + * Allocate and free page tables.
80283 + */
80284 +extern pgd_t *pgd_alloc(struct mm_struct *);
80285 +extern void pgd_free(pgd_t *pgd);
80286 +
80287 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
80288 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
80289 +
80290 +static inline void pte_free_kernel(pte_t *pte)
80291 +{
80292 +       free_page((unsigned long)pte);
80293 +       make_page_writable(pte, XENFEAT_writable_page_tables);
80294 +}
80295 +
80296 +extern void pte_free(struct page *pte);
80297 +
80298 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
80299 +
80300 +#ifdef CONFIG_X86_PAE
80301 +/*
80302 + * In the PAE case we free the pmds as part of the pgd.
80303 + */
80304 +#define pmd_alloc_one(mm, addr)                ({ BUG(); ((pmd_t *)2); })
80305 +#define pmd_free(x)                    do { } while (0)
80306 +#define __pmd_free_tlb(tlb,x)          do { } while (0)
80307 +#define pud_populate(mm, pmd, pte)     BUG()
80308 +#endif
80309 +
80310 +#define check_pgt_cache()      do { } while (0)
80311 +
80312 +#endif /* _I386_PGALLOC_H */
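The pmd_populate() macro above encodes the pinned-pagetable rule: once an mm's pgd has been pinned (registered with the hypervisor as a page table), any pte page linked under it must first be mapped read-only in the guest, and only then installed. A stand-alone sketch of just that ordering, with toy_make_readonly() and toy_link() as hypothetical stand-ins for the HYPERVISOR_update_va_mapping() call and set_pmd():

/* Sketch only: the point is the order of operations on the pinned path. */
#include <stdio.h>

struct toy_mm { int pgd_pinned; };

static void toy_make_readonly(void *pte_page)
{
        printf("write-protect pte page %p\n", pte_page);
}

static void toy_link(void *pmd_slot, void *pte_page)
{
        printf("link pte page %p into pmd slot %p\n", pte_page, pmd_slot);
}

static void toy_pmd_populate(struct toy_mm *mm, void *pmd_slot, void *pte_page)
{
        if (mm->pgd_pinned)
                toy_make_readonly(pte_page);    /* must happen before linking */
        toy_link(pmd_slot, pte_page);           /* unpinned pgds take the plain path */
}

int main(void)
{
        struct toy_mm mm = { .pgd_pinned = 1 };
        char pte_page[64];
        char pmd_slot[8];

        toy_pmd_populate(&mm, pmd_slot, pte_page);
        return 0;
}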
80313 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable-2level.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable-2level.h
80314 --- linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable-2level.h 1970-01-01 00:00:00.000000000 +0000
80315 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable-2level.h       2007-02-02 19:10:55.000000000 +0000
80316 @@ -0,0 +1,78 @@
80317 +#ifndef _I386_PGTABLE_2LEVEL_H
80318 +#define _I386_PGTABLE_2LEVEL_H
80319 +
80320 +#include <asm-generic/pgtable-nopmd.h>
80321 +
80322 +#define pte_ERROR(e) \
80323 +       printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
80324 +#define pgd_ERROR(e) \
80325 +       printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
80326 +
80327 +/*
80328 + * Certain architectures need to do special things when PTEs
80329 + * within a page table are directly modified.  Thus, the following
80330 + * hook is made available.
80331 + */
80332 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
80333 +
80334 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
80335 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
80336 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
80337 +               set_pte((ptep), (pteval));                              \
80338 +} while (0)
80339 +
80340 +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
80341 +#define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
80342 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
80343 +
80344 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
80345 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
80346 +
80347 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
80348 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0))
80349 +
80350 +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
80351 +#define pte_page(x) pfn_to_page(pte_pfn(x))
80352 +#define pte_none(x)            (!(x).pte_low)
80353 +#define pte_pfn(x) mfn_to_local_pfn(pte_mfn(x))
80354 +#define pfn_pte(pfn, prot)     __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
80355 +#define pfn_pmd(pfn, prot)     __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
80356 +
80357 +/*
80358 + * All present user pages are user-executable:
80359 + */
80360 +static inline int pte_exec(pte_t pte)
80361 +{
80362 +       return pte_user(pte);
80363 +}
80364 +
80365 +/*
80366 + * All present pages are kernel-executable:
80367 + */
80368 +static inline int pte_exec_kernel(pte_t pte)
80369 +{
80370 +       return 1;
80371 +}
80372 +
80373 +/*
80374 + * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
80375 + * into this range:
80376 + */
80377 +#define PTE_FILE_MAX_BITS      29
80378 +
80379 +#define pte_to_pgoff(pte) \
80380 +       ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
80381 +
80382 +#define pgoff_to_pte(off) \
80383 +       ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
80384 +
80385 +/* Encode and de-code a swap entry */
80386 +#define __swp_type(x)                  (((x).val >> 1) & 0x1f)
80387 +#define __swp_offset(x)                        ((x).val >> 8)
80388 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
80389 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { (pte).pte_low })
80390 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
80391 +
80392 +void vmalloc_sync_all(void);
80393 +
80394 +#endif /* _I386_PGTABLE_2LEVEL_H */
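The nonlinear-file-mapping encoding in pgtable-2level.h above has to work around pte bits 0, 6 and 7 being reserved (_PAGE_PRESENT, _PAGE_FILE and _PAGE_PROTNONE), so a 29-bit file offset is split into a 5-bit piece stored in bits 1..5 and the remainder stored in bits 8..31. A stand-alone sketch that round-trips the same arithmetic as pte_to_pgoff()/pgoff_to_pte():

/* Sketch only: mirrors the bit packing used by the macros above. */
#include <assert.h>
#include <stdint.h>

#define TOY_PAGE_FILE 0x040             /* bit 6, marks a nonlinear-file pte */

static uint32_t toy_pgoff_to_pte(uint32_t off)
{
        return ((off & 0x1f) << 1) + ((off >> 5) << 8) + TOY_PAGE_FILE;
}

static uint32_t toy_pte_to_pgoff(uint32_t pte_low)
{
        return ((pte_low >> 1) & 0x1f) + ((pte_low >> 8) << 5);
}

int main(void)
{
        /* Every 29-bit offset must survive the encode/decode round trip. */
        for (uint32_t off = 0; off < (1u << 29); off += 12345u)
                assert(toy_pte_to_pgoff(toy_pgoff_to_pte(off)) == off);
        return 0;
}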
80395 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h
80396 --- linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h    1970-01-01 00:00:00.000000000 +0000
80397 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h  2007-02-02 19:10:55.000000000 +0000
80398 @@ -0,0 +1,24 @@
80399 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
80400 +#define _I386_PGTABLE_3LEVEL_DEFS_H
80401 +
80402 +#define HAVE_SHARED_KERNEL_PMD 0
80403 +
80404 +/*
80405 + * PGDIR_SHIFT determines what a top-level page table entry can map
80406 + */
80407 +#define PGDIR_SHIFT    30
80408 +#define PTRS_PER_PGD   4
80409 +
80410 +/*
80411 + * PMD_SHIFT determines the size of the area a middle-level
80412 + * page table can map
80413 + */
80414 +#define PMD_SHIFT      21
80415 +#define PTRS_PER_PMD   512
80416 +
80417 +/*
80418 + * entries per page directory level
80419 + */
80420 +#define PTRS_PER_PTE   512
80421 +
80422 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
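With the PAE constants above, a 32-bit virtual address decomposes into 2 bits of pgd index, 9 bits of pmd index, 9 bits of pte index and a 12-bit page offset; this is the same arithmetic the pgd_index()/pmd_index()/pte_index() macros in pgtable.h perform. A small stand-alone sketch of the split:

/* Sketch only: shift and mask with the three-level PAE constants. */
#include <stdio.h>
#include <stdint.h>

#define TOY_PAGE_SHIFT   12
#define TOY_PMD_SHIFT    21
#define TOY_PGDIR_SHIFT  30
#define TOY_PTRS_PER_PGD 4
#define TOY_PTRS_PER_PMD 512
#define TOY_PTRS_PER_PTE 512

int main(void)
{
        uint32_t addr = 0xc0123456;     /* an arbitrary kernel-space address */

        printf("pgd index %u, pmd index %u, pte index %u, offset %#x\n",
               (unsigned)((addr >> TOY_PGDIR_SHIFT) & (TOY_PTRS_PER_PGD - 1)),
               (unsigned)((addr >> TOY_PMD_SHIFT)   & (TOY_PTRS_PER_PMD - 1)),
               (unsigned)((addr >> TOY_PAGE_SHIFT)  & (TOY_PTRS_PER_PTE - 1)),
               (unsigned)(addr & ((1u << TOY_PAGE_SHIFT) - 1)));
        return 0;
}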
80423 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable-3level.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable-3level.h
80424 --- linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable-3level.h 1970-01-01 00:00:00.000000000 +0000
80425 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable-3level.h       2007-02-02 19:10:55.000000000 +0000
80426 @@ -0,0 +1,195 @@
80427 +#ifndef _I386_PGTABLE_3LEVEL_H
80428 +#define _I386_PGTABLE_3LEVEL_H
80429 +
80430 +#include <asm-generic/pgtable-nopud.h>
80431 +
80432 +/*
80433 + * Intel Physical Address Extension (PAE) Mode - three-level page
80434 + * tables on PPro+ CPUs.
80435 + *
80436 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
80437 + */
80438 +
80439 +#define pte_ERROR(e) \
80440 +       printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
80441 +#define pmd_ERROR(e) \
80442 +       printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
80443 +#define pgd_ERROR(e) \
80444 +       printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
80445 +
80446 +#define pud_none(pud)                          0
80447 +#define pud_bad(pud)                           0
80448 +#define pud_present(pud)                       1
80449 +
80450 +/*
80451 + * Is the pte executable?
80452 + */
80453 +static inline int pte_x(pte_t pte)
80454 +{
80455 +       return !(pte_val(pte) & _PAGE_NX);
80456 +}
80457 +
80458 +/*
80459 + * All present user-pages with !NX bit are user-executable:
80460 + */
80461 +static inline int pte_exec(pte_t pte)
80462 +{
80463 +       return pte_user(pte) && pte_x(pte);
80464 +}
80465 +/*
80466 + * All present pages with !NX bit are kernel-executable:
80467 + */
80468 +static inline int pte_exec_kernel(pte_t pte)
80469 +{
80470 +       return pte_x(pte);
80471 +}
80472 +
80473 +/* Rules for using set_pte: the pte being assigned *must* be
80474 + * either not present or in a state where the hardware will
80475 + * not attempt to update the pte.  In places where this is
80476 + * not possible, use pte_get_and_clear to obtain the old pte
80477 + * value and then use set_pte to update it.  -ben
80478 + */
80479 +#define __HAVE_ARCH_SET_PTE_ATOMIC
80480 +
80481 +#if 1
80482 +/* use writable pagetables */
80483 +static inline void set_pte(pte_t *ptep, pte_t pte)
80484 +{
80485 +       ptep->pte_high = pte.pte_high;
80486 +       smp_wmb();
80487 +       ptep->pte_low = pte.pte_low;
80488 +}
80489 +# define set_pte_atomic(pteptr,pteval) \
80490 +               set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
80491 +
80492 +/*
80493 + * Since this is only called on user PTEs, and the page fault handler
80494 + * must handle the already racy situation of simultaneous page faults,
80495 + * we are justified in merely clearing the PTE present bit, followed
80496 + * by a set.  The ordering here is important.
80497 + */
80498 +static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
80499 +{
80500 +       ptep->pte_low = 0;
80501 +       smp_wmb();
80502 +       ptep->pte_high = pte.pte_high;
80503 +       smp_wmb();
80504 +       ptep->pte_low = pte.pte_low;
80505 +}
80506 +#else
80507 +/* no writable pagetables */
80508 +# define set_pte(pteptr,pteval)                                \
80509 +               xen_l1_entry_update((pteptr), (pteval))
80510 +# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
80511 +# define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
80512 +#endif
80513 +
80514 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
80515 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
80516 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
80517 +               set_pte((ptep), (pteval));                              \
80518 +} while (0)
80519 +
80520 +#define set_pmd(pmdptr,pmdval)                         \
80521 +               xen_l2_entry_update((pmdptr), (pmdval))
80522 +#define set_pud(pudptr,pudval) \
80523 +               xen_l3_entry_update((pudptr), (pudval))
80524 +
80525 +/*
80526 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
80527 + * the TLB via cr3 if the top-level pgd is changed...
80528 + * We do not let the generic code free and clear pgd entries due to
80529 + * this erratum.
80530 + */
80531 +static inline void pud_clear (pud_t * pud) { }
80532 +
80533 +#define pud_page(pud) \
80534 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
80535 +
80536 +#define pud_page_vaddr(pud) \
80537 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
80538 +
80539 +
80540 +/* Find an entry in the second-level page table.. */
80541 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
80542 +                       pmd_index(address))
80543 +
80544 +/*
80545 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
80546 + * entry, so clear the bottom half first and enforce ordering with a compiler
80547 + * barrier.
80548 + */
80549 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
80550 +{
80551 +       ptep->pte_low = 0;
80552 +       smp_wmb();
80553 +       ptep->pte_high = 0;
80554 +}
80555 +
80556 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
80557 +
80558 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
80559 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
80560 +{
80561 +       pte_t res;
80562 +
80563 +       /* xchg acts as a barrier before the setting of the high bits */
80564 +       res.pte_low = xchg(&ptep->pte_low, 0);
80565 +       res.pte_high = ptep->pte_high;
80566 +       ptep->pte_high = 0;
80567 +
80568 +       return res;
80569 +}
80570 +
80571 +#define __HAVE_ARCH_PTE_SAME
80572 +static inline int pte_same(pte_t a, pte_t b)
80573 +{
80574 +       return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
80575 +}
80576 +
80577 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
80578 +
80579 +static inline int pte_none(pte_t pte)
80580 +{
80581 +       return !pte.pte_low && !pte.pte_high;
80582 +}
80583 +
80584 +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
80585 +                      (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
80586 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
80587 +
80588 +extern unsigned long long __supported_pte_mask;
80589 +
80590 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
80591 +{
80592 +       return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
80593 +}
80594 +
80595 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
80596 +{
80597 +       BUG(); panic("needs review");
80598 +       return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
80599 +                       pgprot_val(pgprot)) & __supported_pte_mask);
80600 +}
80601 +
80602 +/*
80603 + * Bits 0, 6 and 7 are taken in the low part of the pte,
80604 + * put the 32 bits of offset into the high part.
80605 + */
80606 +#define pte_to_pgoff(pte) ((pte).pte_high)
80607 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
80608 +#define PTE_FILE_MAX_BITS       32
80609 +
80610 +/* Encode and de-code a swap entry */
80611 +#define __swp_type(x)                  (((x).val) & 0x1f)
80612 +#define __swp_offset(x)                        ((x).val >> 5)
80613 +#define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
80614 +#define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
80615 +#define __swp_entry_to_pte(x)          ((pte_t){ 0, (x).val })
80616 +
80617 +#define __pmd_free_tlb(tlb, x)         do { } while (0)
80618 +
80619 +#define vmalloc_sync_all() ((void)0)
80620 +
80621 +#endif /* _I386_PGTABLE_3LEVEL_H */
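The PAE set_pte()/pte_clear() pair above cannot write a 64-bit entry atomically with 32-bit stores, so it orders the halves: the half holding the present bit (pte_low) is written last when installing an entry and cleared first when tearing one down, with smp_wmb() between the stores. A stand-alone sketch of that ordering, where toy_barrier() is a hypothetical stand-in for smp_wmb():

/* Sketch only: the present bit must never be visible while the two halves
 * are inconsistent. */
#include <stdio.h>
#include <stdint.h>

struct toy_pte { volatile uint32_t pte_low, pte_high; };

static void toy_barrier(void) { __atomic_thread_fence(__ATOMIC_RELEASE); }

static void toy_set_pte(struct toy_pte *ptep, uint32_t low, uint32_t high)
{
        ptep->pte_high = high;          /* new high half first... */
        toy_barrier();
        ptep->pte_low = low;            /* ...present bit becomes visible last */
}

static void toy_pte_clear(struct toy_pte *ptep)
{
        ptep->pte_low = 0;              /* drop the present bit first... */
        toy_barrier();
        ptep->pte_high = 0;             /* ...then the other half can go */
}

int main(void)
{
        struct toy_pte pte = { 0, 0 };

        toy_set_pte(&pte, 0x12345000u | 1, 0x7);
        printf("low=%#x high=%#x\n", pte.pte_low, pte.pte_high);
        toy_pte_clear(&pte);
        return 0;
}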
80622 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable.h
80623 --- linux-2.6.19/include/asm-i386/mach-xen/asm/pgtable.h        1970-01-01 00:00:00.000000000 +0000
80624 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/pgtable.h      2007-02-02 19:10:55.000000000 +0000
80625 @@ -0,0 +1,551 @@
80626 +#ifndef _I386_PGTABLE_H
80627 +#define _I386_PGTABLE_H
80628 +
80629 +#include <asm/hypervisor.h>
80630 +
80631 +/*
80632 + * The Linux memory management assumes a three-level page table setup. On
80633 + * the i386, we use that, but "fold" the mid level into the top-level page
80634 + * table, so that we physically have the same two-level page table as the
80635 + * i386 mmu expects.
80636 + *
80637 + * This file contains the functions and defines necessary to modify and use
80638 + * the i386 page table tree.
80639 + */
80640 +#ifndef __ASSEMBLY__
80641 +#include <asm/processor.h>
80642 +#include <asm/fixmap.h>
80643 +#include <linux/threads.h>
80644 +
80645 +#ifndef _I386_BITOPS_H
80646 +#include <asm/bitops.h>
80647 +#endif
80648 +
80649 +#include <linux/slab.h>
80650 +#include <linux/list.h>
80651 +#include <linux/spinlock.h>
80652 +
80653 +struct mm_struct;
80654 +struct vm_area_struct;
80655 +
80656 +/*
80657 + * ZERO_PAGE is a global shared page that is always zero: used
80658 + * for zero-mapped memory areas etc..
80659 + */
80660 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
80661 +extern unsigned long empty_zero_page[1024];
80662 +extern pgd_t *swapper_pg_dir;
80663 +extern kmem_cache_t *pgd_cache;
80664 +extern kmem_cache_t *pmd_cache;
80665 +extern spinlock_t pgd_lock;
80666 +extern struct page *pgd_list;
80667 +
80668 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
80669 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
80670 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
80671 +void pgtable_cache_init(void);
80672 +void paging_init(void);
80673 +
80674 +/*
80675 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
80676 + * implements both the traditional 2-level x86 page tables and the
80677 + * newer 3-level PAE-mode page tables.
80678 + */
80679 +#ifdef CONFIG_X86_PAE
80680 +# include <asm/pgtable-3level-defs.h>
80681 +# define PMD_SIZE      (1UL << PMD_SHIFT)
80682 +# define PMD_MASK      (~(PMD_SIZE-1))
80683 +#else
80684 +# include <asm/pgtable-2level-defs.h>
80685 +#endif
80686 +
80687 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
80688 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
80689 +
80690 +#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
80691 +#define FIRST_USER_ADDRESS     0
80692 +
80693 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
80694 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
80695 +
80696 +#define TWOLEVEL_PGDIR_SHIFT   22
80697 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
80698 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
80699 +
80700 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
80701 + * current 8MB value just means that there will be an 8MB "hole" after the
80702 + * physical memory until the kernel virtual memory starts.  That means that
80703 + * any out-of-bounds memory accesses will hopefully be caught.
80704 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
80705 + * area for the same reason. ;)
80706 + */
80707 +#define VMALLOC_OFFSET (8*1024*1024)
80708 +#define VMALLOC_START  (((unsigned long) high_memory + vmalloc_earlyreserve + \
80709 +                       2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
80710 +#ifdef CONFIG_HIGHMEM
80711 +# define VMALLOC_END   (PKMAP_BASE-2*PAGE_SIZE)
80712 +#else
80713 +# define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
80714 +#endif
80715 +
80716 +/*
80717 + * _PAGE_PSE set in the page directory entry just means that
80718 + * the page directory entry points directly to a 4MB-aligned block of
80719 + * memory. 
80720 + */
80721 +#define _PAGE_BIT_PRESENT      0
80722 +#define _PAGE_BIT_RW           1
80723 +#define _PAGE_BIT_USER         2
80724 +#define _PAGE_BIT_PWT          3
80725 +#define _PAGE_BIT_PCD          4
80726 +#define _PAGE_BIT_ACCESSED     5
80727 +#define _PAGE_BIT_DIRTY                6
80728 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page, Pentium+, if present.. */
80729 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
80730 +#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
80731 +#define _PAGE_BIT_UNUSED2      10
80732 +#define _PAGE_BIT_UNUSED3      11
80733 +#define _PAGE_BIT_NX           63
80734 +
80735 +#define _PAGE_PRESENT  0x001
80736 +#define _PAGE_RW       0x002
80737 +#define _PAGE_USER     0x004
80738 +#define _PAGE_PWT      0x008
80739 +#define _PAGE_PCD      0x010
80740 +#define _PAGE_ACCESSED 0x020
80741 +#define _PAGE_DIRTY    0x040
80742 +#define _PAGE_PSE      0x080   /* 4 MB (or 2MB) page, Pentium+, if present.. */
80743 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry PPro+ */
80744 +#define _PAGE_UNUSED1  0x200   /* available for programmer */
80745 +#define _PAGE_UNUSED2  0x400
80746 +#define _PAGE_UNUSED3  0x800
80747 +
80748 +/* If _PAGE_PRESENT is clear, we use these: */
80749 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
80750 +#define _PAGE_PROTNONE 0x080   /* if the user mapped it with PROT_NONE;
80751 +                                  pte_present gives true */
80752 +#ifdef CONFIG_X86_PAE
80753 +#define _PAGE_NX       (1ULL<<_PAGE_BIT_NX)
80754 +#else
80755 +#define _PAGE_NX       0
80756 +#endif
80757 +
80758 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
80759 +#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
80760 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
80761 +
80762 +#define PAGE_NONE \
80763 +       __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
80764 +#define PAGE_SHARED \
80765 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
80766 +
80767 +#define PAGE_SHARED_EXEC \
80768 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
80769 +#define PAGE_COPY_NOEXEC \
80770 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
80771 +#define PAGE_COPY_EXEC \
80772 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
80773 +#define PAGE_COPY \
80774 +       PAGE_COPY_NOEXEC
80775 +#define PAGE_READONLY \
80776 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
80777 +#define PAGE_READONLY_EXEC \
80778 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
80779 +
80780 +#define _PAGE_KERNEL \
80781 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
80782 +#define _PAGE_KERNEL_EXEC \
80783 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
80784 +
80785 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
80786 +#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
80787 +#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD)
80788 +#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
80789 +#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
80790 +
80791 +#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL)
80792 +#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO)
80793 +#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC)
80794 +#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE)
80795 +#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE)
80796 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
80797 +
80798 +/*
80799 + * The i386 can't do page protection for execute, and considers
80800 + * execute the same as read. Also, write permissions imply read permissions.
80801 + * This is the closest we can get..
80802 + */
80803 +#define __P000 PAGE_NONE
80804 +#define __P001 PAGE_READONLY
80805 +#define __P010 PAGE_COPY
80806 +#define __P011 PAGE_COPY
80807 +#define __P100 PAGE_READONLY_EXEC
80808 +#define __P101 PAGE_READONLY_EXEC
80809 +#define __P110 PAGE_COPY_EXEC
80810 +#define __P111 PAGE_COPY_EXEC
80811 +
80812 +#define __S000 PAGE_NONE
80813 +#define __S001 PAGE_READONLY
80814 +#define __S010 PAGE_SHARED
80815 +#define __S011 PAGE_SHARED
80816 +#define __S100 PAGE_READONLY_EXEC
80817 +#define __S101 PAGE_READONLY_EXEC
80818 +#define __S110 PAGE_SHARED_EXEC
80819 +#define __S111 PAGE_SHARED_EXEC
80820 +
80821 +/*
80822 + * Define this if things work differently on an i386 and an i486:
80823 + * it will (on an i486) warn about kernel memory accesses that are
80824 + * done without an 'access_ok(VERIFY_WRITE,..)'
80825 + */
80826 +#undef TEST_ACCESS_OK
80827 +
80828 +/* The boot page tables (all created as a single array) */
80829 +extern unsigned long pg0[];
80830 +
80831 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
80832 +
80833 +/* To avoid harmful races, pmd_none(x) should check only the lower 32 bits when PAE is enabled */
80834 +#define pmd_none(x)    (!(unsigned long)pmd_val(x))
80835 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
80836 +   can temporarily clear it. */
80837 +#define pmd_present(x) (pmd_val(x))
80838 +#define pmd_bad(x)     ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
80839 +
80840 +
80841 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
80842 +
80843 +/*
80844 + * The following only work if pte_present() is true.
80845 + * Undefined behaviour if not..
80846 + */
80847 +static inline int pte_user(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
80848 +static inline int pte_read(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
80849 +static inline int pte_dirty(pte_t pte)         { return (pte).pte_low & _PAGE_DIRTY; }
80850 +static inline int pte_young(pte_t pte)         { return (pte).pte_low & _PAGE_ACCESSED; }
80851 +static inline int pte_write(pte_t pte)         { return (pte).pte_low & _PAGE_RW; }
80852 +static inline int pte_huge(pte_t pte)          { return (pte).pte_low & _PAGE_PSE; }
80853 +
80854 +/*
80855 + * The following only works if pte_present() is not true.
80856 + */
80857 +static inline int pte_file(pte_t pte)          { return (pte).pte_low & _PAGE_FILE; }
80858 +
80859 +static inline pte_t pte_rdprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
80860 +static inline pte_t pte_exprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
80861 +static inline pte_t pte_mkclean(pte_t pte)     { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
80862 +static inline pte_t pte_mkold(pte_t pte)       { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
80863 +static inline pte_t pte_wrprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_RW; return pte; }
80864 +static inline pte_t pte_mkread(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
80865 +static inline pte_t pte_mkexec(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
80866 +static inline pte_t pte_mkdirty(pte_t pte)     { (pte).pte_low |= _PAGE_DIRTY; return pte; }
80867 +static inline pte_t pte_mkyoung(pte_t pte)     { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
80868 +static inline pte_t pte_mkwrite(pte_t pte)     { (pte).pte_low |= _PAGE_RW; return pte; }
80869 +static inline pte_t pte_mkhuge(pte_t pte)      { (pte).pte_low |= _PAGE_PSE; return pte; }
80870 +
80871 +#ifdef CONFIG_X86_PAE
80872 +# include <asm/pgtable-3level.h>
80873 +#else
80874 +# include <asm/pgtable-2level.h>
80875 +#endif
80876 +
80877 +/*
80878 + * Rules for using pte_update - it must be called after any PTE update which
80879 + * has not been done using the set_pte / clear_pte interfaces.  It is used by
80880 + * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
80881 + * updates should either be sets, clears, or set_pte_atomic for P->P
80882 + * transitions, which means this hook should only be called for user PTEs.
80883 + * This hook implies a P->P protection or access change has taken place, which
80884 + * requires a subsequent TLB flush.  The notification can optionally be delayed
80885 + * until the TLB flush event by using the pte_update_defer form of the
80886 + * interface, but care must be taken to assure that the flush happens while
80887 + * still holding the same page table lock so that the shadow and primary pages
80888 + * do not become out of sync on SMP.
80889 + */
80890 +#define pte_update(mm, addr, ptep)             do { } while (0)
80891 +#define pte_update_defer(mm, addr, ptep)       do { } while (0)
80892 +
80893 +
80894 +/*
80895 + * We only update the dirty/accessed state if we set
80896 + * the dirty bit by hand in the kernel, since the hardware
80897 + * will do the accessed bit for us, and we don't want to
80898 + * race with other CPU's that might be updating the dirty
80899 + * bit at the same time.
80900 + */
80901 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
80902 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
80903 +       do {                                                              \
80904 +               if (__dirty) {                                            \
80905 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
80906 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
80907 +                           pte_update_defer((__vma)->vm_mm, (__address), (__ptep)); \
80908 +                       } else {                                          \
80909 +                            xen_l1_entry_update((__ptep), (__entry)); \
80910 +                           pte_update_defer((__vma)->vm_mm, (__address), (__ptep)); \
80911 +                           flush_tlb_page((__vma), (__address));         \
80912 +                       }                                                 \
80913 +               }                                                         \
80914 +       } while (0)
80915 +
80916 +/*
80917 + * We don't actually have these, but we want to advertise them so that
80918 + * we can encompass the flush here.
80919 + */
80920 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
80921 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
80922 +
80923 +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
80924 +#define ptep_clear_flush_dirty(vma, address, ptep)                     \
80925 +({                                                                     \
80926 +       int __dirty;                                                    \
80927 +       __dirty = pte_dirty(*(ptep));                                   \
80928 +       if (__dirty) {                                                  \
80929 +               clear_bit(_PAGE_BIT_DIRTY, &(ptep)->pte_low);           \
80930 +               pte_update_defer((vma)->vm_mm, (address), (ptep));      \
80931 +               flush_tlb_page(vma, address);                           \
80932 +       }                                                               \
80933 +       __dirty;                                                        \
80934 +})
80935 +
80936 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
80937 +#define ptep_clear_flush_young(vma, address, ptep)                     \
80938 +({                                                                     \
80939 +       int __young;                                                    \
80940 +       __young = pte_young(*(ptep));                                   \
80941 +       if (__young) {                                                  \
80942 +               clear_bit(_PAGE_BIT_ACCESSED, &(ptep)->pte_low);        \
80943 +               pte_update_defer((vma)->vm_mm, (address), (ptep));      \
80944 +               flush_tlb_page(vma, address);                           \
80945 +       }                                                               \
80946 +       __young;                                                        \
80947 +})
80948 +
80949 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
80950 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
80951 +{
80952 +       pte_t pte;
80953 +       if (full) {
80954 +               pte = *ptep;
80955 +               pte_clear(mm, addr, ptep);
80956 +       } else {
80957 +               pte = ptep_get_and_clear(mm, addr, ptep);
80958 +       }
80959 +       return pte;
80960 +}
80961 +
80962 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
80963 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
80964 +{
80965 +       if (pte_write(*ptep))
80966 +               clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
80967 +       pte_update(mm, addr, ptep);
80968 +}
80969 +
80970 +/*
80971 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
80972 + *
80973 + *  dst - pointer to pgd range anywhere on a pgd page
80974 + *  src - ""
80975 + *  count - the number of pgds to copy.
80976 + *
80977 + * dst and src can be on the same page, but the range must not overlap,
80978 + * and must not cross a page boundary.
80979 + */
80980 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
80981 +{
80982 +       memcpy(dst, src, count * sizeof(pgd_t));
80983 +}
80984 +
80985 +/*
80986 + * Macro to mark a page protection value as "uncacheable".  On processors which do not support
80987 + * it, this is a no-op.
80988 + */
80989 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3)                                          \
80990 +                                ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
80991 +
80992 +/*
80993 + * Conversion functions: convert a page and protection to a page entry,
80994 + * and a page entry and page directory to the page they refer to.
80995 + */
80996 +
80997 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
80998 +
80999 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
81000 +{
81001 +       pte.pte_low &= _PAGE_CHG_MASK;
81002 +       pte.pte_low |= pgprot_val(newprot);
81003 +#ifdef CONFIG_X86_PAE
81004 +       /*
81005 +        * Chop off the NX bit (if present), and add the NX portion of
81006 +        * the newprot (if present):
81007 +        */
81008 +       pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
81009 +       pte.pte_high |= (pgprot_val(newprot) >> 32) & \
81010 +                                       (__supported_pte_mask >> 32);
81011 +#endif
81012 +       return pte;
81013 +}
81014 +
81015 +#define pmd_large(pmd) \
81016 +((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
81017 +
81018 +/*
81019 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
81020 + *
81021 + * this macro returns the index of the entry in the pgd page which would
81022 + * control the given virtual address
81023 + */
81024 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
81025 +#define pgd_index_k(addr) pgd_index(addr)
81026 +
81027 +/*
81028 + * pgd_offset() returns a (pgd_t *)
81029 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
81030 + */
81031 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
81032 +
81033 +/*
81034 + * a shortcut which implies the use of the kernel's pgd, instead
81035 + * of a process's
81036 + */
81037 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
81038 +
81039 +/*
81040 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
81041 + *
81042 + * this macro returns the index of the entry in the pmd page which would
81043 + * control the given virtual address
81044 + */
81045 +#define pmd_index(address) \
81046 +               (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
81047 +
81048 +/*
81049 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
81050 + *
81051 + * this macro returns the index of the entry in the pte page which would
81052 + * control the given virtual address
81053 + */
81054 +#define pte_index(address) \
81055 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
81056 +#define pte_offset_kernel(dir, address) \
81057 +       ((pte_t *) pmd_page_vaddr(*(dir)) +  pte_index(address))
81058 +
81059 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
81060 +
81061 +#define pmd_page_vaddr(pmd) \
81062 +               ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
81063 +
81064 +/*
81065 + * Helper function that returns the kernel pagetable entry controlling
81066 + * the virtual address 'address'. NULL means no pagetable entry present.
81067 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
81068 + * as a pte too.
81069 + */
81070 +extern pte_t *lookup_address(unsigned long address);
81071 +
81072 +/*
81073 + * Make a given kernel text page executable/non-executable.
81074 + * Returns the previous executability setting of that page (which
81075 + * is used to restore the previous state). Used by the SMP bootup code.
81076 + * NOTE: this is an __init function for security reasons.
81077 + */
81078 +#ifdef CONFIG_X86_PAE
81079 + extern int set_kernel_exec(unsigned long vaddr, int enable);
81080 +#else
81081 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
81082 +#endif
81083 +
81084 +#if defined(CONFIG_HIGHPTE)
81085 +#define pte_offset_map(dir, address) \
81086 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
81087 +        pte_index(address))
81088 +#define pte_offset_map_nested(dir, address) \
81089 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
81090 +        pte_index(address))
81091 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
81092 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
81093 +#else
81094 +#define pte_offset_map(dir, address) \
81095 +       ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
81096 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
81097 +#define pte_unmap(pte) do { } while (0)
81098 +#define pte_unmap_nested(pte) do { } while (0)
81099 +#endif
81100 +
81101 +/* Clear a kernel PTE and flush it from the TLB */
81102 +#define kpte_clear_flush(ptep, vaddr)                                  \
81103 +do {                                                                   \
81104 +       pte_clear(&init_mm, vaddr, ptep);                               \
81105 +       __flush_tlb_one(vaddr);                                         \
81106 +} while (0)
81107 +
81108 +/*
81109 + * The i386 doesn't have any external MMU info: the kernel page
81110 + * tables contain all the necessary information.
81111 + */
81112 +#define update_mmu_cache(vma,address,pte) do { } while (0)
81113 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
81114 +
81115 +#define __HAVE_ARCH_PTEP_ESTABLISH
81116 +#define ptep_establish(__vma, __address, __ptep, __entry)              \
81117 +do {                                                                   \
81118 +       ptep_set_access_flags(__vma, __address, __ptep, __entry, 1);    \
81119 +} while (0)
81120 +
81121 +#include <xen/features.h>
81122 +void make_lowmem_page_readonly(void *va, unsigned int feature);
81123 +void make_lowmem_page_writable(void *va, unsigned int feature);
81124 +void make_page_readonly(void *va, unsigned int feature);
81125 +void make_page_writable(void *va, unsigned int feature);
81126 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
81127 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
81128 +
81129 +#define virt_to_ptep(__va)                                             \
81130 +({                                                                     \
81131 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
81132 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
81133 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
81134 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
81135 +})
81136 +
81137 +#define arbitrary_virt_to_machine(__va)                                        \
81138 +({                                                                     \
81139 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
81140 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
81141 +})
81142 +
81143 +#endif /* !__ASSEMBLY__ */
81144 +
81145 +#ifdef CONFIG_FLATMEM
81146 +#define kern_addr_valid(addr)  (1)
81147 +#endif /* CONFIG_FLATMEM */
81148 +
81149 +int direct_remap_pfn_range(struct vm_area_struct *vma,
81150 +                           unsigned long address, 
81151 +                           unsigned long mfn,
81152 +                           unsigned long size, 
81153 +                           pgprot_t prot,
81154 +                           domid_t  domid);
81155 +int direct_kernel_remap_pfn_range(unsigned long address, 
81156 +                                 unsigned long mfn,
81157 +                                 unsigned long size, 
81158 +                                 pgprot_t prot,
81159 +                                 domid_t  domid);
81160 +int create_lookup_pte_addr(struct mm_struct *mm,
81161 +                           unsigned long address,
81162 +                           uint64_t *ptep);
81163 +int touch_pte_range(struct mm_struct *mm,
81164 +                    unsigned long address,
81165 +                    unsigned long size);
81166 +
81167 +#define io_remap_pfn_range(vma,vaddr,pfn,size,prot) \
81168 +       direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
81169 +
81170 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
81171 +#define GET_IOSPACE(pfn)               0
81172 +#define GET_PFN(pfn)                   (pfn)
81173 +
81174 +#include <asm-generic/pgtable.h>
81175 +
81176 +#endif /* _I386_PGTABLE_H */
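
(Illustrative aside, not part of the patch hunks.)  A minimal sketch of how a
Xen guest might combine the helpers declared above: virt_to_ptep() walks
pgd -> pud -> pmd -> pte for a directly-mapped kernel address, and
arbitrary_virt_to_machine() turns that address into a machine address suitable
for a hypercall.  The function name is hypothetical and the include assumes
the usual mach-xen header layout:

    #include <asm/pgtable.h>

    static maddr_t kernel_va_to_machine(void *va)
    {
            pte_t *ptep = virt_to_ptep(va);   /* pgd -> pud -> pmd -> pte walk */

            if (!pte_present(*ptep))
                    return 0;                 /* unmapped; sketch-only policy */

            /* MFN taken from the PTE, plus the offset within the page. */
            return arbitrary_virt_to_machine(va);
    }
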
81177 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/processor.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/processor.h
81178 --- linux-2.6.19/include/asm-i386/mach-xen/asm/processor.h      1970-01-01 00:00:00.000000000 +0000
81179 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/processor.h    2007-02-02 19:10:55.000000000 +0000
81180 @@ -0,0 +1,737 @@
81181 +/*
81182 + * include/asm-i386/processor.h
81183 + *
81184 + * Copyright (C) 1994 Linus Torvalds
81185 + */
81186 +
81187 +#ifndef __ASM_I386_PROCESSOR_H
81188 +#define __ASM_I386_PROCESSOR_H
81189 +
81190 +#include <asm/vm86.h>
81191 +#include <asm/math_emu.h>
81192 +#include <asm/segment.h>
81193 +#include <asm/page.h>
81194 +#include <asm/types.h>
81195 +#include <asm/sigcontext.h>
81196 +#include <asm/cpufeature.h>
81197 +#include <asm/msr.h>
81198 +#include <asm/system.h>
81199 +#include <linux/cache.h>
81200 +#include <linux/threads.h>
81201 +#include <asm/percpu.h>
81202 +#include <linux/cpumask.h>
81203 +#include <xen/interface/physdev.h>
81204 +
81205 +/* flag for disabling the tsc */
81206 +extern int tsc_disable;
81207 +
81208 +struct desc_struct {
81209 +       unsigned long a,b;
81210 +};
81211 +
81212 +#define desc_empty(desc) \
81213 +               (!((desc)->a | (desc)->b))
81214 +
81215 +#define desc_equal(desc1, desc2) \
81216 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
81217 +/*
81218 + * Default implementation of macro that returns current
81219 + * instruction pointer ("program counter").
81220 + */
81221 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
81222 +
81223 +/*
81224 + *  CPU type and hardware bug flags. Kept separately for each CPU.
81225 + *  Members of this structure are referenced in head.S, so think twice
81226 + *  before touching them. [mj]
81227 + */
81228 +
81229 +struct cpuinfo_x86 {
81230 +       __u8    x86;            /* CPU family */
81231 +       __u8    x86_vendor;     /* CPU vendor */
81232 +       __u8    x86_model;
81233 +       __u8    x86_mask;
81234 +       char    wp_works_ok;    /* It doesn't on 386's */
81235 +       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
81236 +       char    hard_math;
81237 +       char    rfu;
81238 +               int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
81239 +       unsigned long   x86_capability[NCAPINTS];
81240 +       char    x86_vendor_id[16];
81241 +       char    x86_model_id[64];
81242 +       int     x86_cache_size;  /* in KB - valid for CPUS which support this
81243 +                                   call  */
81244 +       int     x86_cache_alignment;    /* In bytes */
81245 +       char    fdiv_bug;
81246 +       char    f00f_bug;
81247 +       char    coma_bug;
81248 +       char    pad0;
81249 +       int     x86_power;
81250 +       unsigned long loops_per_jiffy;
81251 +#ifdef CONFIG_SMP
81252 +       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
81253 +#endif
81254 +       unsigned char x86_max_cores;    /* cpuid returned max cores value */
81255 +       unsigned char apicid;
81256 +#ifdef CONFIG_SMP
81257 +       unsigned char booted_cores;     /* number of cores as seen by OS */
81258 +       __u8 phys_proc_id;              /* Physical processor id. */
81259 +       __u8 cpu_core_id;               /* Core id */
81260 +#endif
81261 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
81262 +
81263 +#define X86_VENDOR_INTEL 0
81264 +#define X86_VENDOR_CYRIX 1
81265 +#define X86_VENDOR_AMD 2
81266 +#define X86_VENDOR_UMC 3
81267 +#define X86_VENDOR_NEXGEN 4
81268 +#define X86_VENDOR_CENTAUR 5
81269 +#define X86_VENDOR_RISE 6
81270 +#define X86_VENDOR_TRANSMETA 7
81271 +#define X86_VENDOR_NSC 8
81272 +#define X86_VENDOR_NUM 9
81273 +#define X86_VENDOR_UNKNOWN 0xff
81274 +
81275 +/*
81276 + * capabilities of CPUs
81277 + */
81278 +
81279 +extern struct cpuinfo_x86 boot_cpu_data;
81280 +extern struct cpuinfo_x86 new_cpu_data;
81281 +#ifndef CONFIG_X86_NO_TSS
81282 +extern struct tss_struct doublefault_tss;
81283 +DECLARE_PER_CPU(struct tss_struct, init_tss);
81284 +#endif
81285 +
81286 +#ifdef CONFIG_SMP
81287 +extern struct cpuinfo_x86 cpu_data[];
81288 +#define current_cpu_data cpu_data[smp_processor_id()]
81289 +#else
81290 +#define cpu_data (&boot_cpu_data)
81291 +#define current_cpu_data boot_cpu_data
81292 +#endif
81293 +
81294 +extern int cpu_llc_id[NR_CPUS];
81295 +extern char ignore_fpu_irq;
81296 +
81297 +extern void identify_cpu(struct cpuinfo_x86 *);
81298 +extern void print_cpu_info(struct cpuinfo_x86 *);
81299 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
81300 +extern unsigned short num_cache_leaves;
81301 +
81302 +#ifdef CONFIG_X86_HT
81303 +extern void detect_ht(struct cpuinfo_x86 *c);
81304 +#else
81305 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
81306 +#endif
81307 +
81308 +/*
81309 + * EFLAGS bits
81310 + */
81311 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
81312 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
81313 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
81314 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
81315 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
81316 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
81317 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
81318 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
81319 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
81320 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
81321 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
81322 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
81323 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
81324 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
81325 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
81326 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
81327 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
81328 +
81329 +static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
81330 +                          unsigned int *ecx, unsigned int *edx)
81331 +{
81332 +       /* ecx is often an input as well as an output. */
81333 +       __asm__(XEN_CPUID
81334 +               : "=a" (*eax),
81335 +                 "=b" (*ebx),
81336 +                 "=c" (*ecx),
81337 +                 "=d" (*edx)
81338 +               : "0" (*eax), "2" (*ecx));
81339 +}
81340 +
81341 +/*
81342 + * Generic CPUID function
81343 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
81344 + * resulting in stale register contents being returned.
81345 + */
81346 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
81347 +{
81348 +       *eax = op;
81349 +       *ecx = 0;
81350 +       __cpuid(eax, ebx, ecx, edx);
81351 +}
81352 +
81353 +/* Some CPUID calls want 'count' to be placed in ecx */
81354 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
81355 +                              int *edx)
81356 +{
81357 +       *eax = op;
81358 +       *ecx = count;
81359 +       __cpuid(eax, ebx, ecx, edx);
81360 +}
81361 +
81362 +/*
81363 + * CPUID functions returning a single datum
81364 + */
81365 +static inline unsigned int cpuid_eax(unsigned int op)
81366 +{
81367 +       unsigned int eax, ebx, ecx, edx;
81368 +
81369 +       cpuid(op, &eax, &ebx, &ecx, &edx);
81370 +       return eax;
81371 +}
81372 +static inline unsigned int cpuid_ebx(unsigned int op)
81373 +{
81374 +       unsigned int eax, ebx, ecx, edx;
81375 +
81376 +       cpuid(op, &eax, &ebx, &ecx, &edx);
81377 +       return ebx;
81378 +}
81379 +static inline unsigned int cpuid_ecx(unsigned int op)
81380 +{
81381 +       unsigned int eax, ebx, ecx, edx;
81382 +
81383 +       cpuid(op, &eax, &ebx, &ecx, &edx);
81384 +       return ecx;
81385 +}
81386 +static inline unsigned int cpuid_edx(unsigned int op)
81387 +{
81388 +       unsigned int eax, ebx, ecx, edx;
81389 +
81390 +       cpuid(op, &eax, &ebx, &ecx, &edx);
81391 +       return edx;
81392 +}
81393 +
81394 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
81395 +
81396 +/*
81397 + * Intel CPU features in CR4
81398 + */
81399 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
81400 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
81401 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
81402 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
81403 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
81404 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
81405 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
81406 +#define X86_CR4_PGE            0x0080  /* enable global pages */
81407 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
81408 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
81409 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
81410 +
81411 +/*
81412 + * Save the cr4 feature set we're using (i.e.
81413 + * Pentium 4MB enable and PPro Global page
81414 + * enable), so that any CPUs that boot up
81415 + * after us can get the correct flags.
81416 + */
81417 +extern unsigned long mmu_cr4_features;
81418 +
81419 +static inline void set_in_cr4 (unsigned long mask)
81420 +{
81421 +       unsigned cr4;
81422 +       mmu_cr4_features |= mask;
81423 +       cr4 = read_cr4();
81424 +       cr4 |= mask;
81425 +       write_cr4(cr4);
81426 +}
81427 +
81428 +static inline void clear_in_cr4 (unsigned long mask)
81429 +{
81430 +       unsigned cr4;
81431 +       mmu_cr4_features &= ~mask;
81432 +       cr4 = read_cr4();
81433 +       cr4 &= ~mask;
81434 +       write_cr4(cr4);
81435 +}
81436 +
81437 +/*
81438 + *      NSC/Cyrix CPU configuration register indexes
81439 + */
81440 +
81441 +#define CX86_PCR0 0x20
81442 +#define CX86_GCR  0xb8
81443 +#define CX86_CCR0 0xc0
81444 +#define CX86_CCR1 0xc1
81445 +#define CX86_CCR2 0xc2
81446 +#define CX86_CCR3 0xc3
81447 +#define CX86_CCR4 0xe8
81448 +#define CX86_CCR5 0xe9
81449 +#define CX86_CCR6 0xea
81450 +#define CX86_CCR7 0xeb
81451 +#define CX86_PCR1 0xf0
81452 +#define CX86_DIR0 0xfe
81453 +#define CX86_DIR1 0xff
81454 +#define CX86_ARR_BASE 0xc4
81455 +#define CX86_RCR_BASE 0xdc
81456 +
81457 +/*
81458 + *      NSC/Cyrix CPU indexed register access macros
81459 + */
81460 +
81461 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
81462 +
81463 +#define setCx86(reg, data) do { \
81464 +       outb((reg), 0x22); \
81465 +       outb((data), 0x23); \
81466 +} while (0)
81467 +
81468 +/* Stop speculative execution */
81469 +static inline void sync_core(void)
81470 +{
81471 +       int tmp;
81472 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
81473 +}
81474 +
81475 +static inline void __monitor(const void *eax, unsigned long ecx,
81476 +               unsigned long edx)
81477 +{
81478 +       /* "monitor %eax,%ecx,%edx;" */
81479 +       asm volatile(
81480 +               ".byte 0x0f,0x01,0xc8;"
81481 +               : :"a" (eax), "c" (ecx), "d"(edx));
81482 +}
81483 +
81484 +static inline void __mwait(unsigned long eax, unsigned long ecx)
81485 +{
81486 +       /* "mwait %eax,%ecx;" */
81487 +       asm volatile(
81488 +               ".byte 0x0f,0x01,0xc9;"
81489 +               : :"a" (eax), "c" (ecx));
81490 +}
81491 +
81492 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
81493 +
81494 +/* from system description table in BIOS.  Mostly for MCA use, but
81495 +others may find it useful. */
81496 +extern unsigned int machine_id;
81497 +extern unsigned int machine_submodel_id;
81498 +extern unsigned int BIOS_revision;
81499 +extern unsigned int mca_pentium_flag;
81500 +
81501 +/* Boot loader type from the setup header */
81502 +extern int bootloader_type;
81503 +
81504 +/*
81505 + * User space process size: 3GB (default).
81506 + */
81507 +#define TASK_SIZE      (PAGE_OFFSET)
81508 +
81509 +/* This decides where the kernel will search for a free chunk of vm
81510 + * space during mmap's.
81511 + */
81512 +#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
81513 +
81514 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
81515 +
81516 +/*
81517 + * Size of io_bitmap.
81518 + */
81519 +#define IO_BITMAP_BITS  65536
81520 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
81521 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
81522 +#ifndef CONFIG_X86_NO_TSS
81523 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
81524 +#endif
81525 +#define INVALID_IO_BITMAP_OFFSET 0x8000
81526 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
81527 +
81528 +struct i387_fsave_struct {
81529 +       long    cwd;
81530 +       long    swd;
81531 +       long    twd;
81532 +       long    fip;
81533 +       long    fcs;
81534 +       long    foo;
81535 +       long    fos;
81536 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
81537 +       long    status;         /* software status information */
81538 +};
81539 +
81540 +struct i387_fxsave_struct {
81541 +       unsigned short  cwd;
81542 +       unsigned short  swd;
81543 +       unsigned short  twd;
81544 +       unsigned short  fop;
81545 +       long    fip;
81546 +       long    fcs;
81547 +       long    foo;
81548 +       long    fos;
81549 +       long    mxcsr;
81550 +       long    mxcsr_mask;
81551 +       long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
81552 +       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
81553 +       long    padding[56];
81554 +} __attribute__ ((aligned (16)));
81555 +
81556 +struct i387_soft_struct {
81557 +       long    cwd;
81558 +       long    swd;
81559 +       long    twd;
81560 +       long    fip;
81561 +       long    fcs;
81562 +       long    foo;
81563 +       long    fos;
81564 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
81565 +       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
81566 +       struct info     *info;
81567 +       unsigned long   entry_eip;
81568 +};
81569 +
81570 +union i387_union {
81571 +       struct i387_fsave_struct        fsave;
81572 +       struct i387_fxsave_struct       fxsave;
81573 +       struct i387_soft_struct soft;
81574 +};
81575 +
81576 +typedef struct {
81577 +       unsigned long seg;
81578 +} mm_segment_t;
81579 +
81580 +struct thread_struct;
81581 +
81582 +#ifndef CONFIG_X86_NO_TSS
81583 +struct tss_struct {
81584 +       unsigned short  back_link,__blh;
81585 +       unsigned long   esp0;
81586 +       unsigned short  ss0,__ss0h;
81587 +       unsigned long   esp1;
81588 +       unsigned short  ss1,__ss1h;     /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
81589 +       unsigned long   esp2;
81590 +       unsigned short  ss2,__ss2h;
81591 +       unsigned long   __cr3;
81592 +       unsigned long   eip;
81593 +       unsigned long   eflags;
81594 +       unsigned long   eax,ecx,edx,ebx;
81595 +       unsigned long   esp;
81596 +       unsigned long   ebp;
81597 +       unsigned long   esi;
81598 +       unsigned long   edi;
81599 +       unsigned short  es, __esh;
81600 +       unsigned short  cs, __csh;
81601 +       unsigned short  ss, __ssh;
81602 +       unsigned short  ds, __dsh;
81603 +       unsigned short  fs, __fsh;
81604 +       unsigned short  gs, __gsh;
81605 +       unsigned short  ldt, __ldth;
81606 +       unsigned short  trace, io_bitmap_base;
81607 +       /*
81608 +        * The extra 1 is there because the CPU will access an
81609 +        * additional byte beyond the end of the IO permission
81610 +        * bitmap. The extra byte must be all 1 bits, and must
81611 +        * be within the limit.
81612 +        */
81613 +       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
81614 +       /*
81615 +        * Cache the current maximum and the last task that used the bitmap:
81616 +        */
81617 +       unsigned long io_bitmap_max;
81618 +       struct thread_struct *io_bitmap_owner;
81619 +       /*
81620 +        * pads the TSS to be cacheline-aligned (size is 0x100)
81621 +        */
81622 +       unsigned long __cacheline_filler[35];
81623 +       /*
81624 +        * .. and then another 0x100 bytes for emergency kernel stack
81625 +        */
81626 +       unsigned long stack[64];
81627 +} __attribute__((packed));
81628 +#endif
81629 +
81630 +#define ARCH_MIN_TASKALIGN     16
81631 +
81632 +struct thread_struct {
81633 +/* cached TLS descriptors. */
81634 +       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
81635 +       unsigned long   esp0;
81636 +       unsigned long   sysenter_cs;
81637 +       unsigned long   eip;
81638 +       unsigned long   esp;
81639 +       unsigned long   fs;
81640 +       unsigned long   gs;
81641 +/* Hardware debugging registers */
81642 +       unsigned long   debugreg[8];  /* %%db0-7 debug registers */
81643 +/* fault info */
81644 +       unsigned long   cr2, trap_no, error_code;
81645 +/* floating point info */
81646 +       union i387_union        i387;
81647 +/* virtual 86 mode info */
81648 +       struct vm86_struct __user * vm86_info;
81649 +       unsigned long           screen_bitmap;
81650 +       unsigned long           v86flags, v86mask, saved_esp0;
81651 +       unsigned int            saved_fs, saved_gs;
81652 +/* IO permissions */
81653 +       unsigned long   *io_bitmap_ptr;
81654 +       unsigned long   iopl;
81655 +/* max allowed port in the bitmap, in bytes: */
81656 +       unsigned long   io_bitmap_max;
81657 +};
81658 +
81659 +#define INIT_THREAD  {                                                 \
81660 +       .vm86_info = NULL,                                              \
81661 +       .sysenter_cs = __KERNEL_CS,                                     \
81662 +       .io_bitmap_ptr = NULL,                                          \
81663 +}
81664 +
81665 +#ifndef CONFIG_X86_NO_TSS
81666 +/*
81667 + * Note that the .io_bitmap member must be extra-big. This is because
81668 + * the CPU will access an additional byte beyond the end of the IO
81669 + * permission bitmap. The extra byte must be all 1 bits, and must
81670 + * be within the limit.
81671 + */
81672 +#define INIT_TSS  {                                                    \
81673 +       .esp0           = sizeof(init_stack) + (long)&init_stack,       \
81674 +       .ss0            = __KERNEL_DS,                                  \
81675 +       .ss1            = __KERNEL_CS,                                  \
81676 +       .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,                     \
81677 +       .io_bitmap      = { [ 0 ... IO_BITMAP_LONGS] = ~0 },            \
81678 +}
81679 +
81680 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
81681 +{
81682 +       tss->esp0 = thread->esp0;
81683 +       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
81684 +       if (unlikely(tss->ss1 != thread->sysenter_cs)) {
81685 +               tss->ss1 = thread->sysenter_cs;
81686 +               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
81687 +       }
81688 +}
81689 +#define load_esp0(tss, thread) \
81690 +       __load_esp0(tss, thread)
81691 +#else
81692 +#define load_esp0(tss, thread) \
81693 +       HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)
81694 +#endif
81695 +
81696 +#define start_thread(regs, new_eip, new_esp) do {              \
81697 +       __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));       \
81698 +       set_fs(USER_DS);                                        \
81699 +       regs->xds = __USER_DS;                                  \
81700 +       regs->xes = __USER_DS;                                  \
81701 +       regs->xss = __USER_DS;                                  \
81702 +       regs->xcs = __USER_CS;                                  \
81703 +       regs->eip = new_eip;                                    \
81704 +       regs->esp = new_esp;                                    \
81705 +} while (0)
81706 +
81707 +/*
81708 + * These special macros can be used to get or set a debugging register
81709 + */
81710 +#define get_debugreg(var, register)                            \
81711 +               (var) = HYPERVISOR_get_debugreg((register))
81712 +#define set_debugreg(value, register)                  \
81713 +               HYPERVISOR_set_debugreg((register), (value))
81714 +
81715 +/*
81716 + * Set IOPL bits in EFLAGS from given mask
81717 + */
81718 +static inline void set_iopl_mask(unsigned mask)
81719 +{
81720 +       struct physdev_set_iopl set_iopl;
81721 +
81722 +       /* Force the change at ring 0. */
81723 +       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
81724 +       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
81725 +}
81726 +
81727 +/* Forward declaration, a strange C thing */
81728 +struct task_struct;
81729 +struct mm_struct;
81730 +
81731 +/* Free all resources held by a thread. */
81732 +extern void release_thread(struct task_struct *);
81733 +
81734 +/* Prepare to copy thread state - unlazy all lazy status */
81735 +extern void prepare_to_copy(struct task_struct *tsk);
81736 +
81737 +/*
81738 + * create a kernel thread without removing it from tasklists
81739 + */
81740 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
81741 +
81742 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
81743 +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
81744 +
81745 +unsigned long get_wchan(struct task_struct *p);
81746 +
81747 +#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
81748 +#define KSTK_TOP(info)                                                 \
81749 +({                                                                     \
81750 +       unsigned long *__ptr = (unsigned long *)(info);                 \
81751 +       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
81752 +})
81753 +
81754 +/*
81755 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
81756 + * This is necessary to guarantee that the entire "struct pt_regs"
81757 + * is accessible even if the CPU hasn't stored the SS/ESP registers
81758 + * on the stack (interrupt gate does not save these registers
81759 + * when switching to the same priv ring).
81760 + * Therefore beware: accessing the xss/esp fields of the
81761 + * "struct pt_regs" is possible, but they may contain the
81762 + * completely wrong values.
81763 + */
81764 +#define task_pt_regs(task)                                             \
81765 +({                                                                     \
81766 +       struct pt_regs *__regs__;                                       \
81767 +       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
81768 +       __regs__ - 1;                                                   \
81769 +})
81770 +
81771 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
81772 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
81773 +
81774 +
81775 +struct microcode_header {
81776 +       unsigned int hdrver;
81777 +       unsigned int rev;
81778 +       unsigned int date;
81779 +       unsigned int sig;
81780 +       unsigned int cksum;
81781 +       unsigned int ldrver;
81782 +       unsigned int pf;
81783 +       unsigned int datasize;
81784 +       unsigned int totalsize;
81785 +       unsigned int reserved[3];
81786 +};
81787 +
81788 +struct microcode {
81789 +       struct microcode_header hdr;
81790 +       unsigned int bits[0];
81791 +};
81792 +
81793 +typedef struct microcode microcode_t;
81794 +typedef struct microcode_header microcode_header_t;
81795 +
81796 +/* microcode format is extended from prescott processors */
81797 +struct extended_signature {
81798 +       unsigned int sig;
81799 +       unsigned int pf;
81800 +       unsigned int cksum;
81801 +};
81802 +
81803 +struct extended_sigtable {
81804 +       unsigned int count;
81805 +       unsigned int cksum;
81806 +       unsigned int reserved[3];
81807 +       struct extended_signature sigs[0];
81808 +};
81809 +
81810 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
81811 +static inline void rep_nop(void)
81812 +{
81813 +       __asm__ __volatile__("rep;nop": : :"memory");
81814 +}
81815 +
81816 +#define cpu_relax()    rep_nop()
81817 +
81818 +/* generic versions from gas */
81819 +#define GENERIC_NOP1   ".byte 0x90\n"
81820 +#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
81821 +#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
81822 +#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
81823 +#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
81824 +#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
81825 +#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
81826 +#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
81827 +
81828 +/* Opteron nops */
81829 +#define K8_NOP1 GENERIC_NOP1
81830 +#define K8_NOP2        ".byte 0x66,0x90\n" 
81831 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
81832 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
81833 +#define K8_NOP5        K8_NOP3 K8_NOP2 
81834 +#define K8_NOP6        K8_NOP3 K8_NOP3
81835 +#define K8_NOP7        K8_NOP4 K8_NOP3
81836 +#define K8_NOP8        K8_NOP4 K8_NOP4
81837 +
81838 +/* K7 nops */
81839 +/* uses eax dependencies (arbitrary choice) */
81840 +#define K7_NOP1  GENERIC_NOP1
81841 +#define K7_NOP2        ".byte 0x8b,0xc0\n" 
81842 +#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
81843 +#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
81844 +#define K7_NOP5        K7_NOP4 ASM_NOP1
81845 +#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
81846 +#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
81847 +#define K7_NOP8        K7_NOP7 ASM_NOP1
81848 +
81849 +#ifdef CONFIG_MK8
81850 +#define ASM_NOP1 K8_NOP1
81851 +#define ASM_NOP2 K8_NOP2
81852 +#define ASM_NOP3 K8_NOP3
81853 +#define ASM_NOP4 K8_NOP4
81854 +#define ASM_NOP5 K8_NOP5
81855 +#define ASM_NOP6 K8_NOP6
81856 +#define ASM_NOP7 K8_NOP7
81857 +#define ASM_NOP8 K8_NOP8
81858 +#elif defined(CONFIG_MK7)
81859 +#define ASM_NOP1 K7_NOP1
81860 +#define ASM_NOP2 K7_NOP2
81861 +#define ASM_NOP3 K7_NOP3
81862 +#define ASM_NOP4 K7_NOP4
81863 +#define ASM_NOP5 K7_NOP5
81864 +#define ASM_NOP6 K7_NOP6
81865 +#define ASM_NOP7 K7_NOP7
81866 +#define ASM_NOP8 K7_NOP8
81867 +#else
81868 +#define ASM_NOP1 GENERIC_NOP1
81869 +#define ASM_NOP2 GENERIC_NOP2
81870 +#define ASM_NOP3 GENERIC_NOP3
81871 +#define ASM_NOP4 GENERIC_NOP4
81872 +#define ASM_NOP5 GENERIC_NOP5
81873 +#define ASM_NOP6 GENERIC_NOP6
81874 +#define ASM_NOP7 GENERIC_NOP7
81875 +#define ASM_NOP8 GENERIC_NOP8
81876 +#endif
81877 +
81878 +#define ASM_NOP_MAX 8
81879 +
81880 +/* Prefetch instructions for Pentium III and AMD Athlon */
81881 +/* It's not worth caring about 3dnow! prefetches for the K6
81882 +   because they are microcoded there and very slow.
81883 +   However, we don't do prefetches for pre-XP Athlons currently;
81884 +   that should be fixed. */
81885 +#define ARCH_HAS_PREFETCH
81886 +static inline void prefetch(const void *x)
81887 +{
81888 +       alternative_input(ASM_NOP4,
81889 +                         "prefetchnta (%1)",
81890 +                         X86_FEATURE_XMM,
81891 +                         "r" (x));
81892 +}
81893 +
81894 +#define ARCH_HAS_PREFETCH
81895 +#define ARCH_HAS_PREFETCHW
81896 +#define ARCH_HAS_SPINLOCK_PREFETCH
81897 +
81898 +/* 3dnow! prefetch to get an exclusive cache line. Useful for 
81899 +   spinlocks to avoid one state transition in the cache coherency protocol. */
81900 +static inline void prefetchw(const void *x)
81901 +{
81902 +       alternative_input(ASM_NOP4,
81903 +                         "prefetchw (%1)",
81904 +                         X86_FEATURE_3DNOW,
81905 +                         "r" (x));
81906 +}
81907 +#define spin_lock_prefetch(x)  prefetchw(x)
81908 +
81909 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
81910 +
81911 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
81912 +
81913 +extern unsigned long boot_option_idle_override;
81914 +extern void enable_sep_cpu(void);
81915 +extern int sysenter_setup(void);
81916 +
81917 +#endif /* __ASM_I386_PROCESSOR_H */
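
(Illustrative aside, not part of the patch hunks.)  A minimal sketch of the
cpuid helpers defined above; in this port the __cpuid() wrapper emits the
XEN_CPUID opcode sequence so that Xen can intercept the instruction.  The
function name is hypothetical:

    #include <linux/kernel.h>
    #include <asm/processor.h>

    static void report_basic_cpuid(void)
    {
            unsigned int eax, ebx, ecx, edx;

            cpuid(0, &eax, &ebx, &ecx, &edx);       /* leaf 0: max leaf + vendor id */
            printk(KERN_INFO "max basic leaf %u, vendor %.4s%.4s%.4s\n",
                   eax, (char *)&ebx, (char *)&edx, (char *)&ecx);

            if (cpuid_edx(1) & (1 << 25))           /* leaf 1, EDX bit 25 = SSE */
                    printk(KERN_INFO "CPUID reports SSE\n");
    }
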
81918 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/scatterlist.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/scatterlist.h
81919 --- linux-2.6.19/include/asm-i386/mach-xen/asm/scatterlist.h    1970-01-01 00:00:00.000000000 +0000
81920 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/scatterlist.h  2007-02-02 19:10:55.000000000 +0000
81921 @@ -0,0 +1,22 @@
81922 +#ifndef _I386_SCATTERLIST_H
81923 +#define _I386_SCATTERLIST_H
81924 +
81925 +struct scatterlist {
81926 +    struct page                *page;
81927 +    unsigned int       offset;
81928 +    unsigned int       length;
81929 +    dma_addr_t         dma_address;
81930 +    unsigned int       dma_length;
81931 +};
81932 +
81933 +/* These macros should be used after a pci_map_sg call has been done
81934 + * to get bus addresses of each of the SG entries and their lengths.
81935 + * You should only work with the number of sg entries pci_map_sg
81936 + * returns.
81937 + */
81938 +#define sg_dma_address(sg)     ((sg)->dma_address)
81939 +#define sg_dma_len(sg)         ((sg)->dma_length)
81940 +
81941 +#define ISA_DMA_THRESHOLD (0x00ffffff)
81942 +
81943 +#endif /* !(_I386_SCATTERLIST_H) */
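
(Illustrative aside, not part of the patch hunks.)  A minimal sketch of the
intended use of sg_dma_address()/sg_dma_len(): only after pci_map_sg() has
succeeded, and only for the number of entries it returned.  'pdev', 'sg' and
'nents' are hypothetical caller-provided values:

    #include <linux/pci.h>
    #include <linux/kernel.h>

    static void program_descriptors(struct pci_dev *pdev,
                                    struct scatterlist *sg, int nents)
    {
            int i, mapped = pci_map_sg(pdev, sg, nents, PCI_DMA_TODEVICE);

            for (i = 0; i < mapped; i++)    /* entries may have been coalesced */
                    printk(KERN_DEBUG "sg %d: bus %#llx, len %u\n", i,
                           (unsigned long long)sg_dma_address(&sg[i]),
                           sg_dma_len(&sg[i]));

            /* unmap with the original nents, not the returned count */
            pci_unmap_sg(pdev, sg, nents, PCI_DMA_TODEVICE);
    }
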
81944 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/segment.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/segment.h
81945 --- linux-2.6.19/include/asm-i386/mach-xen/asm/segment.h        1970-01-01 00:00:00.000000000 +0000
81946 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/segment.h      2007-02-02 19:10:55.000000000 +0000
81947 @@ -0,0 +1,132 @@
81948 +#ifndef _ASM_SEGMENT_H
81949 +#define _ASM_SEGMENT_H
81950 +
81951 +/*
81952 + * The layout of the per-CPU GDT under Linux:
81953 + *
81954 + *   0 - null
81955 + *   1 - reserved
81956 + *   2 - reserved
81957 + *   3 - reserved
81958 + *
81959 + *   4 - unused                        <==== new cacheline
81960 + *   5 - unused
81961 + *
81962 + *  ------- start of TLS (Thread-Local Storage) segments:
81963 + *
81964 + *   6 - TLS segment #1                        [ glibc's TLS segment ]
81965 + *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
81966 + *   8 - TLS segment #3
81967 + *   9 - reserved
81968 + *  10 - reserved
81969 + *  11 - reserved
81970 + *
81971 + *  ------- start of kernel segments:
81972 + *
81973 + *  12 - kernel code segment           <==== new cacheline
81974 + *  13 - kernel data segment
81975 + *  14 - default user CS
81976 + *  15 - default user DS
81977 + *  16 - TSS
81978 + *  17 - LDT
81979 + *  18 - PNPBIOS support (16->32 gate)
81980 + *  19 - PNPBIOS support
81981 + *  20 - PNPBIOS support
81982 + *  21 - PNPBIOS support
81983 + *  22 - PNPBIOS support
81984 + *  23 - APM BIOS support
81985 + *  24 - APM BIOS support
81986 + *  25 - APM BIOS support 
81987 + *
81988 + *  26 - ESPFIX small SS
81989 + *  27 - unused
81990 + *  28 - unused
81991 + *  29 - unused
81992 + *  30 - unused
81993 + *  31 - TSS for double fault handler
81994 + */
81995 +#define GDT_ENTRY_TLS_ENTRIES  3
81996 +#define GDT_ENTRY_TLS_MIN      6
81997 +#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
81998 +
81999 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
82000 +
82001 +#define GDT_ENTRY_DEFAULT_USER_CS      14
82002 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
82003 +
82004 +#define GDT_ENTRY_DEFAULT_USER_DS      15
82005 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
82006 +
82007 +#define GDT_ENTRY_KERNEL_BASE  12
82008 +
82009 +#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
82010 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
82011 +
82012 +#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
82013 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
82014 +
82015 +#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
82016 +#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
82017 +
82018 +#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
82019 +#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
82020 +
82021 +#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
82022 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
82023 +
82024 +#define GDT_ENTRY_DOUBLEFAULT_TSS      31
82025 +
82026 +/*
82027 + * The GDT has 32 entries
82028 + */
82029 +#define GDT_ENTRIES 32
82030 +
82031 +#define GDT_SIZE (GDT_ENTRIES * 8)
82032 +
82033 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
82034 +#define SEGMENT_IS_FLAT_CODE(x)  ((x) == __USER_CS || (x) == (__KERNEL_CS | get_kernel_rpl()))
82035 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
82036 +#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
82037 +
82038 +/* Simple and small GDT entries for booting only */
82039 +
82040 +#define GDT_ENTRY_BOOT_CS              2
82041 +#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
82042 +
82043 +#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
82044 +#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
82045 +
82046 +/* The PnP BIOS entries in the GDT */
82047 +#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
82048 +#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
82049 +#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
82050 +#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
82051 +#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
82052 +
82053 +/* The PnP BIOS selectors */
82054 +#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
82055 +#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
82056 +#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
82057 +#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
82058 +#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
82059 +
82060 +/*
82061 + * The interrupt descriptor table has room for 256 entries;
82062 + * the global descriptor table is dependent on the number
82063 + * of tasks we can have.
82064 + */
82065 +#define IDT_ENTRIES 256
82066 +
82067 +/* Bottom two bits of selector give the ring privilege level */
82068 +#define SEGMENT_RPL_MASK       0x3
82069 +/* Bit 2 is table indicator (LDT/GDT) */
82070 +#define SEGMENT_TI_MASK                0x4
82071 +
82072 +/* User mode is privilege level 3 */
82073 +#define USER_RPL               0x3
82074 +/* LDT segment has TI set, GDT has it cleared */
82075 +#define SEGMENT_LDT            0x4
82076 +#define SEGMENT_GDT            0x0
82077 +
82078 +#define get_kernel_rpl()   (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
82079 +#endif
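
(Illustrative note, not part of the patch hunks.)  A selector is its GDT index
times 8 (descriptors are 8 bytes) with the requested privilege level in the
low two bits, so the layout above works out to __KERNEL_CS = 12*8 = 0x60,
__KERNEL_DS = 13*8 = 0x68, __USER_CS = 14*8 + 3 = 0x73 and
__USER_DS = 15*8 + 3 = 0x7b.  Under Xen, get_kernel_rpl() returns 1 unless the
XENFEAT_supervisor_mode_kernel feature is present, so kernel selectors are
normally used with RPL 1 rather than 0 (the guest kernel runs in ring 1).
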
82080 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/setup.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/setup.h
82081 --- linux-2.6.19/include/asm-i386/mach-xen/asm/setup.h  1970-01-01 00:00:00.000000000 +0000
82082 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/setup.h        2007-02-02 19:10:55.000000000 +0000
82083 @@ -0,0 +1,81 @@
82084 +/*
82085 + *     Just a placeholder. We don't want to have to test x86 before
82086 + *     we include stuff.
82087 + */
82088 +
82089 +#ifndef _i386_SETUP_H
82090 +#define _i386_SETUP_H
82091 +
82092 +#ifdef __KERNEL__
82093 +#include <linux/pfn.h>
82094 +
82095 +/*
82096 + * Reserved space for vmalloc and iomap - defined in asm/page.h
82097 + */
82098 +#define MAXMEM_PFN     PFN_DOWN(MAXMEM)
82099 +#define MAX_NONPAE_PFN (1 << 20)
82100 +#endif
82101 +
82102 +#define PARAM_SIZE 4096
82103 +#define COMMAND_LINE_SIZE 256
82104 +
82105 +#define OLD_CL_MAGIC_ADDR      0x90020
82106 +#define OLD_CL_MAGIC           0xA33F
82107 +#define OLD_CL_BASE_ADDR       0x90000
82108 +#define OLD_CL_OFFSET          0x90022
82109 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
82110 +
82111 +#ifndef __ASSEMBLY__
82112 +/*
82113 + * This is set up by the setup-routine at boot-time
82114 + */
82115 +extern unsigned char boot_params[PARAM_SIZE];
82116 +
82117 +#define PARAM  (boot_params)
82118 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
82119 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
82120 +#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0))
82121 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
82122 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
82123 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
82124 +#define IST_INFO   (*(struct ist_info *) (PARAM+0x60))
82125 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
82126 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
82127 +#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
82128 +#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
82129 +#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
82130 +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
82131 +#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
82132 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
82133 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
82134 +#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
82135 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
82136 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
82137 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
82138 +#define KERNEL_START (*(unsigned long *) (PARAM+0x214))
82139 +#define INITRD_START (__pa(xen_start_info->mod_start))
82140 +#define INITRD_SIZE (xen_start_info->mod_len)
82141 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
82142 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
82143 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
82144 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
82145 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
82146 +
82147 +/*
82148 + * Do NOT EVER look at the BIOS memory size location.
82149 + * It does not work on many machines.
82150 + */
82151 +#define LOWMEMSIZE()   (0x9f000)
82152 +
82153 +struct e820entry;
82154 +
82155 +char * __init machine_specific_memory_setup(void);
82156 +
82157 +int __init copy_e820_map(struct e820entry * biosmap, int nr_map);
82158 +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map);
82159 +void __init add_memory_region(unsigned long long start,
82160 +                             unsigned long long size, int type);
82161 +
82162 +#endif /* __ASSEMBLY__ */
82163 +
82164 +#endif /* _i386_SETUP_H */
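
(Illustrative aside, not part of the patch hunks.)  INITRD_START and
INITRD_SIZE above are redefined in terms of xen_start_info: under Xen the
initial ramdisk is handed over via the start_info page rather than the
real-mode boot_params block.  A minimal, hypothetical consumer, assuming the
usual mach-xen headers:

    #include <linux/kernel.h>
    #include <linux/init.h>
    #include <asm/hypervisor.h>
    #include <asm/setup.h>

    static void __init report_initrd(void)
    {
            if (INITRD_SIZE)
                    printk(KERN_INFO "initrd: %lu bytes at phys %#lx\n",
                           (unsigned long)INITRD_SIZE,
                           (unsigned long)INITRD_START);
    }
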
82165 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/smp.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/smp.h
82166 --- linux-2.6.19/include/asm-i386/mach-xen/asm/smp.h    1970-01-01 00:00:00.000000000 +0000
82167 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/smp.h  2007-02-02 19:10:55.000000000 +0000
82168 @@ -0,0 +1,114 @@
82169 +#ifndef __ASM_SMP_H
82170 +#define __ASM_SMP_H
82171 +
82172 +/*
82173 + * We need the APIC definitions automatically as part of 'smp.h'
82174 + */
82175 +#ifndef __ASSEMBLY__
82176 +#include <linux/kernel.h>
82177 +#include <linux/threads.h>
82178 +#include <linux/cpumask.h>
82179 +#endif
82180 +
82181 +#ifdef CONFIG_X86_LOCAL_APIC
82182 +#ifndef __ASSEMBLY__
82183 +#include <asm/fixmap.h>
82184 +#include <asm/bitops.h>
82185 +#include <asm/mpspec.h>
82186 +#ifdef CONFIG_X86_IO_APIC
82187 +#include <asm/io_apic.h>
82188 +#endif
82189 +#include <asm/apic.h>
82190 +#endif
82191 +#endif
82192 +
82193 +#define BAD_APICID 0xFFu
82194 +#ifdef CONFIG_SMP
82195 +#ifndef __ASSEMBLY__
82196 +
82197 +/*
82198 + * Private routines/data
82199 + */
82200 +
82201 +extern void smp_alloc_memory(void);
82202 +extern int pic_mode;
82203 +extern int smp_num_siblings;
82204 +extern cpumask_t cpu_sibling_map[];
82205 +extern cpumask_t cpu_core_map[];
82206 +
82207 +extern void (*mtrr_hook) (void);
82208 +extern void zap_low_mappings (void);
82209 +extern void lock_ipi_call_lock(void);
82210 +extern void unlock_ipi_call_lock(void);
82211 +
82212 +#define MAX_APICID 256
82213 +extern u8 x86_cpu_to_apicid[];
82214 +
82215 +#define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
82216 +
82217 +#ifdef CONFIG_HOTPLUG_CPU
82218 +extern void cpu_exit_clear(void);
82219 +extern void cpu_uninit(void);
82220 +#endif
82221 +
82222 +/*
82223 + * This function is needed by all SMP systems. It must _always_ be valid
82224 + * from the initial startup. We map APIC_BASE very early in page_setup(),
82225 + * so this is correct in the x86 case.
82226 + */
82227 +#define raw_smp_processor_id() (current_thread_info()->cpu)
82228 +
82229 +extern cpumask_t cpu_possible_map;
82230 +#define cpu_callin_map cpu_possible_map
82231 +
82232 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
82233 +static inline int num_booting_cpus(void)
82234 +{
82235 +       return cpus_weight(cpu_possible_map);
82236 +}
82237 +
82238 +#ifdef CONFIG_X86_LOCAL_APIC
82239 +
82240 +#ifdef APIC_DEFINITION
82241 +extern int hard_smp_processor_id(void);
82242 +#else
82243 +#include <mach_apicdef.h>
82244 +static inline int hard_smp_processor_id(void)
82245 +{
82246 +       /* we don't want to mark this access volatile - bad code generation */
82247 +       return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
82248 +}
82249 +#endif
82250 +#endif
82251 +
82252 +extern int safe_smp_processor_id(void);
82253 +extern int __cpu_disable(void);
82254 +extern void __cpu_die(unsigned int cpu);
82255 +extern unsigned int num_processors;
82256 +extern void prefill_possible_map(void);
82257 +
82258 +#endif /* !__ASSEMBLY__ */
82259 +
82260 +#else /* CONFIG_SMP */
82261 +
82262 +#define safe_smp_processor_id()                0
82263 +#define cpu_physical_id(cpu)           boot_cpu_physical_apicid
82264 +
82265 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
82266 +
82267 +#endif
82268 +
82269 +#ifndef __ASSEMBLY__
82270 +
82271 +extern u8 apicid_2_node[];
82272 +
82273 +#ifdef CONFIG_X86_LOCAL_APIC
82274 +static __inline int logical_smp_processor_id(void)
82275 +{
82276 +       /* we don't want to mark this access volatile - bad code generation */
82277 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
82278 +}
82279 +#endif
82280 +#endif
82281 +
82282 +#endif
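
(Illustrative aside, not part of the patch hunks.)  A small sketch of the
helpers above, assuming CONFIG_SMP: raw_smp_processor_id() reads the cpu
number cached in thread_info rather than querying a (possibly absent) local
APIC, and cpu_physical_id() maps it to an APIC id.  The function name is
hypothetical:

    #include <linux/kernel.h>
    #include <asm/thread_info.h>
    #include <asm/smp.h>

    static void log_current_cpu(void)
    {
            int cpu = raw_smp_processor_id();

            printk(KERN_INFO "vcpu %d (apicid %u), %d cpus booting\n",
                   cpu, cpu_physical_id(cpu), num_booting_cpus());
    }
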
82283 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/swiotlb.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/swiotlb.h
82284 --- linux-2.6.19/include/asm-i386/mach-xen/asm/swiotlb.h        1970-01-01 00:00:00.000000000 +0000
82285 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/swiotlb.h      2007-02-02 19:10:55.000000000 +0000
82286 @@ -0,0 +1,43 @@
82287 +#ifndef _ASM_SWIOTLB_H
82288 +#define _ASM_SWIOTLB_H 1
82289 +
82290 +/* SWIOTLB interface */
82291 +
82292 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
82293 +                                     int dir);
82294 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
82295 +                                 size_t size, int dir);
82296 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
82297 +                                        dma_addr_t dev_addr,
82298 +                                        size_t size, int dir);
82299 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
82300 +                                           dma_addr_t dev_addr,
82301 +                                           size_t size, int dir);
82302 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
82303 +                                    struct scatterlist *sg, int nelems,
82304 +                                    int dir);
82305 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
82306 +                                       struct scatterlist *sg, int nelems,
82307 +                                       int dir);
82308 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
82309 +                     int nents, int direction);
82310 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
82311 +                        int nents, int direction);
82312 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
82313 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
82314 +                                   unsigned long offset, size_t size,
82315 +                                   enum dma_data_direction direction);
82316 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
82317 +                               size_t size, enum dma_data_direction direction);
82318 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
82319 +extern void swiotlb_init(void);
82320 +
82321 +extern unsigned int dma_bits;
82322 +
82323 +#ifdef CONFIG_SWIOTLB
82324 +extern int swiotlb;
82325 +#else
82326 +#define swiotlb 0
82327 +#endif
82328 +
82329 +#endif
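
(Illustrative aside, not part of the patch hunks.)  Drivers normally reach the
SWIOTLB through the generic dma_map_single()/dma_map_sg() wrappers; the direct
call below is only a sketch of the signatures declared above, with 'dev' and
'buf' as hypothetical caller state:

    #include <linux/device.h>
    #include <linux/dma-mapping.h>
    #include <asm/swiotlb.h>

    static dma_addr_t bounce_map(struct device *dev, void *buf, size_t len)
    {
            dma_addr_t handle = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);

            if (swiotlb_dma_mapping_error(handle))
                    return 0;       /* sketch-only convention: 0 == failure */

            return handle;
    }
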
82330 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/synch_bitops.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/synch_bitops.h
82331 --- linux-2.6.19/include/asm-i386/mach-xen/asm/synch_bitops.h   1970-01-01 00:00:00.000000000 +0000
82332 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/synch_bitops.h 2007-02-02 19:10:55.000000000 +0000
82333 @@ -0,0 +1,145 @@
82334 +#ifndef __XEN_SYNCH_BITOPS_H__
82335 +#define __XEN_SYNCH_BITOPS_H__
82336 +
82337 +/*
82338 + * Copyright 1992, Linus Torvalds.
82339 + * Heavily modified to provide guaranteed strong synchronisation
82340 + * when communicating with Xen or other guest OSes running on other CPUs.
82341 + */
82342 +
82343 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
82344 +#include <xen/platform-compat.h>
82345 +#endif
82346 +
82347 +#define ADDR (*(volatile long *) addr)
82348 +
82349 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
82350 +{
82351 +    __asm__ __volatile__ ( 
82352 +        "lock btsl %1,%0"
82353 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
82354 +}
82355 +
82356 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
82357 +{
82358 +    __asm__ __volatile__ (
82359 +        "lock btrl %1,%0"
82360 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
82361 +}
82362 +
82363 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
82364 +{
82365 +    __asm__ __volatile__ (
82366 +        "lock btcl %1,%0"
82367 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
82368 +}
82369 +
82370 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
82371 +{
82372 +    int oldbit;
82373 +    __asm__ __volatile__ (
82374 +        "lock btsl %2,%1\n\tsbbl %0,%0"
82375 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
82376 +    return oldbit;
82377 +}
82378 +
82379 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
82380 +{
82381 +    int oldbit;
82382 +    __asm__ __volatile__ (
82383 +        "lock btrl %2,%1\n\tsbbl %0,%0"
82384 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
82385 +    return oldbit;
82386 +}
82387 +
82388 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
82389 +{
82390 +    int oldbit;
82391 +
82392 +    __asm__ __volatile__ (
82393 +        "lock btcl %2,%1\n\tsbbl %0,%0"
82394 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
82395 +    return oldbit;
82396 +}
82397 +
82398 +struct __synch_xchg_dummy { unsigned long a[100]; };
82399 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
82400 +
82401 +#define synch_cmpxchg(ptr, old, new) \
82402 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
82403 +                                     (unsigned long)(old), \
82404 +                                     (unsigned long)(new), \
82405 +                                     sizeof(*(ptr))))
82406 +
82407 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
82408 +                                           unsigned long old,
82409 +                                           unsigned long new, int size)
82410 +{
82411 +       unsigned long prev;
82412 +       switch (size) {
82413 +       case 1:
82414 +               __asm__ __volatile__("lock; cmpxchgb %b1,%2"
82415 +                                    : "=a"(prev)
82416 +                                    : "q"(new), "m"(*__synch_xg(ptr)),
82417 +                                      "0"(old)
82418 +                                    : "memory");
82419 +               return prev;
82420 +       case 2:
82421 +               __asm__ __volatile__("lock; cmpxchgw %w1,%2"
82422 +                                    : "=a"(prev)
82423 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
82424 +                                      "0"(old)
82425 +                                    : "memory");
82426 +               return prev;
82427 +#ifdef CONFIG_X86_64
82428 +       case 4:
82429 +               __asm__ __volatile__("lock; cmpxchgl %k1,%2"
82430 +                                    : "=a"(prev)
82431 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
82432 +                                      "0"(old)
82433 +                                    : "memory");
82434 +               return prev;
82435 +       case 8:
82436 +               __asm__ __volatile__("lock; cmpxchgq %1,%2"
82437 +                                    : "=a"(prev)
82438 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
82439 +                                      "0"(old)
82440 +                                    : "memory");
82441 +               return prev;
82442 +#else
82443 +       case 4:
82444 +               __asm__ __volatile__("lock; cmpxchgl %1,%2"
82445 +                                    : "=a"(prev)
82446 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
82447 +                                      "0"(old)
82448 +                                    : "memory");
82449 +               return prev;
82450 +#endif
82451 +       }
82452 +       return old;
82453 +}
82454 +
82455 +static __always_inline int synch_const_test_bit(int nr,
82456 +                                               const volatile void * addr)
82457 +{
82458 +    return ((1UL << (nr & 31)) & 
82459 +            (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
82460 +}
82461 +
82462 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
82463 +{
82464 +    int oldbit;
82465 +    __asm__ __volatile__ (
82466 +        "btl %2,%1\n\tsbbl %0,%0"
82467 +        : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
82468 +    return oldbit;
82469 +}
82470 +
82471 +#define synch_test_bit(nr,addr) \
82472 +(__builtin_constant_p(nr) ? \
82473 + synch_const_test_bit((nr),(addr)) : \
82474 + synch_var_test_bit((nr),(addr)))
82475 +
82476 +#define synch_cmpxchg_subword synch_cmpxchg
82477 +
82478 +#endif /* __XEN_SYNCH_BITOPS_H__ */
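
(Illustrative aside, not part of the patch hunks.)  These primitives exist so
that bits in memory shared with Xen or with another guest are always updated
with lock-prefixed instructions, even on uniprocessor builds where the plain
bitops drop the lock prefix.  A hypothetical consumer of a shared
pending-flags word:

    #include <asm/synch_bitops.h>

    static int consume_notification(volatile unsigned long *shared_pending, int nr)
    {
            /* Atomically read and clear bit 'nr'; non-zero means the remote
             * side set it since we last looked. */
            return synch_test_and_clear_bit(nr, (volatile void *)shared_pending);
    }
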
82479 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/system.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/system.h
82480 --- linux-2.6.19/include/asm-i386/mach-xen/asm/system.h 1970-01-01 00:00:00.000000000 +0000
82481 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/system.h       2007-02-02 19:10:55.000000000 +0000
82482 @@ -0,0 +1,531 @@
82483 +#ifndef __ASM_SYSTEM_H
82484 +#define __ASM_SYSTEM_H
82485 +
82486 +#include <linux/kernel.h>
82487 +#include <asm/segment.h>
82488 +#include <asm/cpufeature.h>
82489 +#include <linux/bitops.h> /* for LOCK_PREFIX */
82490 +#include <asm/synch_bitops.h>
82491 +#include <asm/hypervisor.h>
82492 +
82493 +#ifdef __KERNEL__
82494 +
82495 +#ifdef CONFIG_SMP
82496 +#define __vcpu_id smp_processor_id()
82497 +#else
82498 +#define __vcpu_id 0
82499 +#endif
82500 +
82501 +struct task_struct;    /* one of the stranger aspects of C forward declarations.. */
82502 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
82503 +
82504 +/*
82505 + * Saving eflags is important. It switches not only IOPL between tasks,
82506 + * it also protects other tasks from NT leaking through sysenter etc.
82507 + */
82508 +#define switch_to(prev,next,last) do {                                 \
82509 +       unsigned long esi,edi;                                          \
82510 +       asm volatile("pushfl\n\t"               /* Save flags */        \
82511 +                    "pushl %%ebp\n\t"                                  \
82512 +                    "movl %%esp,%0\n\t"        /* save ESP */          \
82513 +                    "movl %5,%%esp\n\t"        /* restore ESP */       \
82514 +                    "movl $1f,%1\n\t"          /* save EIP */          \
82515 +                    "pushl %6\n\t"             /* restore EIP */       \
82516 +                    "jmp __switch_to\n"                                \
82517 +                    "1:\t"                                             \
82518 +                    "popl %%ebp\n\t"                                   \
82519 +                    "popfl"                                            \
82520 +                    :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
82521 +                     "=a" (last),"=S" (esi),"=D" (edi)                 \
82522 +                    :"m" (next->thread.esp),"m" (next->thread.eip),    \
82523 +                     "2" (prev), "d" (next));                          \
82524 +} while (0)
82525 +
82526 +#define _set_base(addr,base) do { unsigned long __pr; \
82527 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
82528 +       "rorl $16,%%edx\n\t" \
82529 +       "movb %%dl,%2\n\t" \
82530 +       "movb %%dh,%3" \
82531 +       :"=&d" (__pr) \
82532 +       :"m" (*((addr)+2)), \
82533 +        "m" (*((addr)+4)), \
82534 +        "m" (*((addr)+7)), \
82535 +         "0" (base) \
82536 +        ); } while(0)
82537 +
82538 +#define _set_limit(addr,limit) do { unsigned long __lr; \
82539 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
82540 +       "rorl $16,%%edx\n\t" \
82541 +       "movb %2,%%dh\n\t" \
82542 +       "andb $0xf0,%%dh\n\t" \
82543 +       "orb %%dh,%%dl\n\t" \
82544 +       "movb %%dl,%2" \
82545 +       :"=&d" (__lr) \
82546 +       :"m" (*(addr)), \
82547 +        "m" (*((addr)+6)), \
82548 +        "0" (limit) \
82549 +        ); } while(0)
82550 +
82551 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
82552 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
82553 +
82554 +/*
82555 + * Load a segment. Fall back on loading the zero
82556 + * segment if something goes wrong..
82557 + */
82558 +#define loadsegment(seg,value)                 \
82559 +       asm volatile("\n"                       \
82560 +               "1:\t"                          \
82561 +               "mov %0,%%" #seg "\n"           \
82562 +               "2:\n"                          \
82563 +               ".section .fixup,\"ax\"\n"      \
82564 +               "3:\t"                          \
82565 +               "pushl $0\n\t"                  \
82566 +               "popl %%" #seg "\n\t"           \
82567 +               "jmp 2b\n"                      \
82568 +               ".previous\n"                   \
82569 +               ".section __ex_table,\"a\"\n\t" \
82570 +               ".align 4\n\t"                  \
82571 +               ".long 1b,3b\n"                 \
82572 +               ".previous"                     \
82573 +               : :"rm" (value))
82574 +
82575 +/*
82576 + * Save a segment register away
82577 + */
82578 +#define savesegment(seg, value) \
82579 +       asm volatile("mov %%" #seg ",%0":"=rm" (value))
82580 +
82581 +#define read_cr0() ({ \
82582 +       unsigned int __dummy; \
82583 +       __asm__ __volatile__( \
82584 +               "movl %%cr0,%0\n\t" \
82585 +               :"=r" (__dummy)); \
82586 +       __dummy; \
82587 +})
82588 +#define write_cr0(x) \
82589 +       __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
82590 +
82591 +#define read_cr2() \
82592 +       (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
82593 +#define write_cr2(x) \
82594 +       __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
82595 +
82596 +#define read_cr3() ({ \
82597 +       unsigned int __dummy; \
82598 +       __asm__ ( \
82599 +               "movl %%cr3,%0\n\t" \
82600 +               :"=r" (__dummy)); \
82601 +       __dummy = xen_cr3_to_pfn(__dummy); \
82602 +       mfn_to_pfn(__dummy) << PAGE_SHIFT; \
82603 +})
82604 +#define write_cr3(x) ({                                                \
82605 +       unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT);   \
82606 +       __dummy = xen_pfn_to_cr3(__dummy);                      \
82607 +       __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy));  \
82608 +})
82609 +
82610 +#define read_cr4() ({ \
82611 +       unsigned int __dummy; \
82612 +       __asm__( \
82613 +               "movl %%cr4,%0\n\t" \
82614 +               :"=r" (__dummy)); \
82615 +       __dummy; \
82616 +})
82617 +#define read_cr4_safe() ({                           \
82618 +       unsigned int __dummy;                         \
82619 +       /* This could fault if %cr4 does not exist */ \
82620 +       __asm__("1: movl %%cr4, %0              \n"   \
82621 +               "2:                             \n"   \
82622 +               ".section __ex_table,\"a\"      \n"   \
82623 +               ".long 1b,2b                    \n"   \
82624 +               ".previous                      \n"   \
82625 +               : "=r" (__dummy): "0" (0));           \
82626 +       __dummy;                                      \
82627 +})
82628 +#define write_cr4(x) \
82629 +       __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
82630 +
82631 +/*
82632 + * Clear and set 'TS' bit respectively
82633 + */
82634 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
82635 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
82636 +
82637 +#endif /* __KERNEL__ */
82638 +
82639 +#define wbinvd() \
82640 +       __asm__ __volatile__ ("wbinvd": : :"memory")
82641 +
82642 +static inline unsigned long get_limit(unsigned long segment)
82643 +{
82644 +       unsigned long __limit;
82645 +       __asm__("lsll %1,%0"
82646 +               :"=r" (__limit):"r" (segment));
82647 +       return __limit+1;
82648 +}
82649 +
82650 +#define nop() __asm__ __volatile__ ("nop")
82651 +
82652 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
82653 +
82654 +#define tas(ptr) (xchg((ptr),1))
82655 +
82656 +struct __xchg_dummy { unsigned long a[100]; };
82657 +#define __xg(x) ((struct __xchg_dummy *)(x))
82658 +
82659 +
82660 +#ifdef CONFIG_X86_CMPXCHG64
82661 +
82662 +/*
82663 + * The semantics of CMPXCHG8B are a bit strange; this is why
82664 + * there is a loop and the loading of %%eax and %%edx has to
82665 + * be inside. This inlines well in most cases; the cached
82666 + * cost is around 38 cycles. (in the future we might want
82667 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
82668 + * might have an implicit FPU-save as a cost, so it's not
82669 + * clear which path to go.)
82670 + *
82671 + * cmpxchg8b must be used with the lock prefix here to allow
82672 + * the instruction to be executed atomically, see page 3-102
82673 + * of the instruction set reference 24319102.pdf. We need
82674 + * the reader side to see the coherent 64bit value.
82675 + */
82676 +static inline void __set_64bit (unsigned long long * ptr,
82677 +               unsigned int low, unsigned int high)
82678 +{
82679 +       __asm__ __volatile__ (
82680 +               "\n1:\t"
82681 +               "movl (%0), %%eax\n\t"
82682 +               "movl 4(%0), %%edx\n\t"
82683 +               "lock cmpxchg8b (%0)\n\t"
82684 +               "jnz 1b"
82685 +               : /* no outputs */
82686 +               :       "D"(ptr),
82687 +                       "b"(low),
82688 +                       "c"(high)
82689 +               :       "ax","dx","memory");
82690 +}
82691 +
82692 +static inline void __set_64bit_constant (unsigned long long *ptr,
82693 +                                                unsigned long long value)
82694 +{
82695 +       __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
82696 +}
82697 +#define ll_low(x)      *(((unsigned int*)&(x))+0)
82698 +#define ll_high(x)     *(((unsigned int*)&(x))+1)
82699 +
82700 +static inline void __set_64bit_var (unsigned long long *ptr,
82701 +                        unsigned long long value)
82702 +{
82703 +       __set_64bit(ptr,ll_low(value), ll_high(value));
82704 +}
82705 +
82706 +#define set_64bit(ptr,value) \
82707 +(__builtin_constant_p(value) ? \
82708 + __set_64bit_constant(ptr, value) : \
82709 + __set_64bit_var(ptr, value) )
82710 +
82711 +#define _set_64bit(ptr,value) \
82712 +(__builtin_constant_p(value) ? \
82713 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
82714 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
82715 +
82716 +#endif
82717 +
82718 +/*
82719 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
82720 + * Note 2: xchg has a side effect, so the volatile attribute is necessary,
82721 + *       but generally the primitive is invalid, *ptr is an output argument. --ANK
82722 + */
82723 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
82724 +{
82725 +       switch (size) {
82726 +               case 1:
82727 +                       __asm__ __volatile__("xchgb %b0,%1"
82728 +                               :"=q" (x)
82729 +                               :"m" (*__xg(ptr)), "0" (x)
82730 +                               :"memory");
82731 +                       break;
82732 +               case 2:
82733 +                       __asm__ __volatile__("xchgw %w0,%1"
82734 +                               :"=r" (x)
82735 +                               :"m" (*__xg(ptr)), "0" (x)
82736 +                               :"memory");
82737 +                       break;
82738 +               case 4:
82739 +                       __asm__ __volatile__("xchgl %0,%1"
82740 +                               :"=r" (x)
82741 +                               :"m" (*__xg(ptr)), "0" (x)
82742 +                               :"memory");
82743 +                       break;
82744 +       }
82745 +       return x;
82746 +}
82747 +
82748 +/*
82749 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
82750 + * store NEW in MEM.  Return the initial value in MEM.  Success is
82751 + * indicated by comparing RETURN with OLD.
82752 + */
82753 +
82754 +#ifdef CONFIG_X86_CMPXCHG
82755 +#define __HAVE_ARCH_CMPXCHG 1
82756 +#define cmpxchg(ptr,o,n)\
82757 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
82758 +                                       (unsigned long)(n),sizeof(*(ptr))))
82759 +#define sync_cmpxchg(ptr,o,n)\
82760 +       ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
82761 +                                       (unsigned long)(n),sizeof(*(ptr))))
82762 +#endif
82763 +
82764 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
82765 +                                     unsigned long new, int size)
82766 +{
82767 +       unsigned long prev;
82768 +       switch (size) {
82769 +       case 1:
82770 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
82771 +                                    : "=a"(prev)
82772 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
82773 +                                    : "memory");
82774 +               return prev;
82775 +       case 2:
82776 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
82777 +                                    : "=a"(prev)
82778 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
82779 +                                    : "memory");
82780 +               return prev;
82781 +       case 4:
82782 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
82783 +                                    : "=a"(prev)
82784 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
82785 +                                    : "memory");
82786 +               return prev;
82787 +       }
82788 +       return old;
82789 +}
82790 +
82791 +/*
82792 + * Always use locked operations when touching memory shared with a
82793 + * hypervisor, since the system may be SMP even if the guest kernel
82794 + * isn't.
82795 + */
82796 +static inline unsigned long __sync_cmpxchg(volatile void *ptr,
82797 +                                           unsigned long old,
82798 +                                           unsigned long new, int size)
82799 +{
82800 +       unsigned long prev;
82801 +       switch (size) {
82802 +       case 1:
82803 +               __asm__ __volatile__("lock; cmpxchgb %b1,%2"
82804 +                                    : "=a"(prev)
82805 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
82806 +                                    : "memory");
82807 +               return prev;
82808 +       case 2:
82809 +               __asm__ __volatile__("lock; cmpxchgw %w1,%2"
82810 +                                    : "=a"(prev)
82811 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
82812 +                                    : "memory");
82813 +               return prev;
82814 +       case 4:
82815 +               __asm__ __volatile__("lock; cmpxchgl %1,%2"
82816 +                                    : "=a"(prev)
82817 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
82818 +                                    : "memory");
82819 +               return prev;
82820 +       }
82821 +       return old;
82822 +}
82823 +
82824 +#ifndef CONFIG_X86_CMPXCHG
82825 +/*
82826 + * Building a kernel capable of running on an 80386. It may be necessary to
82827 + * simulate the cmpxchg on the 80386 CPU. For that purpose we define
82828 + * a function for each of the sizes we support.
82829 + */
82830 +
82831 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
82832 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
82833 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
82834 +
82835 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
82836 +                                     unsigned long new, int size)
82837 +{
82838 +       switch (size) {
82839 +       case 1:
82840 +               return cmpxchg_386_u8(ptr, old, new);
82841 +       case 2:
82842 +               return cmpxchg_386_u16(ptr, old, new);
82843 +       case 4:
82844 +               return cmpxchg_386_u32(ptr, old, new);
82845 +       }
82846 +       return old;
82847 +}
82848 +
82849 +#define cmpxchg(ptr,o,n)                                               \
82850 +({                                                                     \
82851 +       __typeof__(*(ptr)) __ret;                                       \
82852 +       if (likely(boot_cpu_data.x86 > 3))                              \
82853 +               __ret = __cmpxchg((ptr), (unsigned long)(o),            \
82854 +                                       (unsigned long)(n), sizeof(*(ptr))); \
82855 +       else                                                            \
82856 +               __ret = cmpxchg_386((ptr), (unsigned long)(o),          \
82857 +                                       (unsigned long)(n), sizeof(*(ptr))); \
82858 +       __ret;                                                          \
82859 +})
82860 +#endif
82861 +
82862 +#ifdef CONFIG_X86_CMPXCHG64
82863 +
82864 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
82865 +                                     unsigned long long new)
82866 +{
82867 +       unsigned long long prev;
82868 +       __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
82869 +                            : "=A"(prev)
82870 +                            : "b"((unsigned long)new),
82871 +                              "c"((unsigned long)(new >> 32)),
82872 +                              "m"(*__xg(ptr)),
82873 +                              "0"(old)
82874 +                            : "memory");
82875 +       return prev;
82876 +}
82877 +
82878 +#define cmpxchg64(ptr,o,n)\
82879 +       ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
82880 +                                       (unsigned long long)(n)))
82881 +
82882 +#endif
82883 +    
82884 +/*
82885 + * Force strict CPU ordering.
82886 + * And yes, this is required on UP too when we're talking
82887 + * to devices.
82888 + *
82889 + * For now, "wmb()" doesn't actually do anything, as all
82890 + * Intel CPU's follow what Intel calls a *Processor Order*,
82891 + * in which all writes are seen in the program order even
82892 + * outside the CPU.
82893 + *
82894 + * I expect future Intel CPU's to have a weaker ordering,
82895 + * but I'd also expect them to finally get their act together
82896 + * and add some real memory barriers if so.
82897 + *
82898 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
82899 + * nop for these.
82900 + */
82901 + */
82902 +
82903 +/* 
82904 + * Actually only lfence would be needed for mb() because all stores done 
82905 + * by the kernel should be already ordered. But keep a full barrier for now. 
82906 + */
82907 +
82908 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
82909 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
82910 +
82911 +/**
82912 + * read_barrier_depends - Flush all pending reads that subsequent reads
82913 + * depend on.
82914 + *
82915 + * No data-dependent reads from memory-like regions are ever reordered
82916 + * over this barrier.  All reads preceding this primitive are guaranteed
82917 + * to access memory (but not necessarily other CPUs' caches) before any
82918 + * reads following this primitive that depend on the data returned by
82919 + * any of the preceding reads.  This primitive is much lighter weight than
82920 + * rmb() on most CPUs, and is never heavier weight than is
82921 + * rmb().
82922 + *
82923 + * These ordering constraints are respected by both the local CPU
82924 + * and the compiler.
82925 + *
82926 + * Ordering is not guaranteed by anything other than these primitives,
82927 + * not even by data dependencies.  See the documentation for
82928 + * memory_barrier() for examples and URLs to more information.
82929 + *
82930 + * For example, the following code would force ordering (the initial
82931 + * value of "a" is zero, "b" is one, and "p" is "&a"):
82932 + *
82933 + * <programlisting>
82934 + *     CPU 0                           CPU 1
82935 + *
82936 + *     b = 2;
82937 + *     memory_barrier();
82938 + *     p = &b;                         q = p;
82939 + *                                     read_barrier_depends();
82940 + *                                     d = *q;
82941 + * </programlisting>
82942 + *
82943 + * because the read of "*q" depends on the read of "p" and these
82944 + * two reads are separated by a read_barrier_depends().  However,
82945 + * the following code, with the same initial values for "a" and "b":
82946 + *
82947 + * <programlisting>
82948 + *     CPU 0                           CPU 1
82949 + *
82950 + *     a = 2;
82951 + *     memory_barrier();
82952 + *     b = 3;                          y = b;
82953 + *                                     read_barrier_depends();
82954 + *                                     x = a;
82955 + * </programlisting>
82956 + *
82957 + * does not enforce ordering, since there is no data dependency between
82958 + * the read of "a" and the read of "b".  Therefore, on some CPUs, such
82959 + * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
82960 + * in cases like this where there are no data dependencies.
82961 + **/
82962 +
82963 +#define read_barrier_depends() do { } while(0)
82964 +
82965 +#ifdef CONFIG_X86_OOSTORE
82966 +/* Actually there are no OOO store capable CPUs for now that do SSE, 
82967 +   but make it a possibility already. */
82968 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
82969 +#else
82970 +#define wmb()  __asm__ __volatile__ ("": : :"memory")
82971 +#endif
82972 +
82973 +#ifdef CONFIG_SMP
82974 +#define smp_mb()       mb()
82975 +#define smp_rmb()      rmb()
82976 +#define smp_wmb()      wmb()
82977 +#define smp_read_barrier_depends()     read_barrier_depends()
82978 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
82979 +#else
82980 +#define smp_mb()       barrier()
82981 +#define smp_rmb()      barrier()
82982 +#define smp_wmb()      barrier()
82983 +#define smp_read_barrier_depends()     do { } while(0)
82984 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
82985 +#endif
82986 +
82987 +#include <linux/irqflags.h>
82988 +
82989 +/*
82990 + * disable hlt during certain critical i/o operations
82991 + */
82992 +#define HAVE_DISABLE_HLT
82993 +void disable_hlt(void);
82994 +void enable_hlt(void);
82995 +
82996 +extern int es7000_plat;
82997 +void cpu_idle_wait(void);
82998 +
82999 +/*
83000 + * On SMP systems, when the scheduler does migration-cost autodetection,
83001 + * it needs a way to flush as much of the CPU's caches as possible:
83002 + */
83003 +static inline void sched_cacheflush(void)
83004 +{
83005 +       wbinvd();
83006 +}
83007 +
83008 +extern unsigned long arch_align_stack(unsigned long sp);
83009 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
83010 +
83011 +void default_idle(void);
83012 +
83013 +#endif
83014 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/timer.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/timer.h
83015 --- linux-2.6.19/include/asm-i386/mach-xen/asm/timer.h  1970-01-01 00:00:00.000000000 +0000
83016 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/timer.h        2007-02-02 19:10:55.000000000 +0000
83017 @@ -0,0 +1,67 @@
83018 +#ifndef _ASMi386_TIMER_H
83019 +#define _ASMi386_TIMER_H
83020 +#include <linux/init.h>
83021 +#include <linux/pm.h>
83022 +
83023 +/**
83024 + * struct timer_opts - used to define a timer source
83025 + *
83026 + * @name: name of the timer.
83027 + * @init: Probes and initializes the timer. Takes clock= override 
83028 + *        string as an argument. Returns 0 on success, anything else
83029 + *        on failure.
83030 + * @mark_offset: called by the timer interrupt.
83031 + * @get_offset:  called by gettimeofday(). Returns the number of microseconds
83032 + *               since the last timer interrupt.
83033 + * @monotonic_clock: returns the number of nanoseconds since the init of the
83034 + *                   timer.
83035 + * @delay: delays this many clock cycles.
83036 + */
83037 +struct timer_opts {
83038 +       char* name;
83039 +       void (*mark_offset)(void);
83040 +       unsigned long (*get_offset)(void);
83041 +       unsigned long long (*monotonic_clock)(void);
83042 +       void (*delay)(unsigned long);
83043 +       unsigned long (*read_timer)(void);
83044 +       int (*suspend)(pm_message_t state);
83045 +       int (*resume)(void);
83046 +};
83047 +
83048 +struct init_timer_opts {
83049 +       int (*init)(char *override);
83050 +       struct timer_opts *opts;
83051 +};
83052 +
83053 +#define TICK_SIZE (tick_nsec / 1000)
83054 +
83055 +extern struct timer_opts* __init select_timer(void);
83056 +extern void clock_fallback(void);
83057 +void setup_pit_timer(void);
83058 +/* Modifiers for buggy PIT handling */
83059 +extern int pit_latch_buggy;
83060 +extern struct timer_opts *cur_timer;
83061 +extern int timer_ack;
83062 +
83063 +/* list of externed timers */
83064 +extern struct timer_opts timer_none;
83065 +extern struct timer_opts timer_pit;
83066 +extern struct init_timer_opts timer_pit_init;
83067 +extern struct init_timer_opts timer_tsc_init;
83068 +#ifdef CONFIG_X86_CYCLONE_TIMER
83069 +extern struct init_timer_opts timer_cyclone_init;
83070 +#endif
83071 +
83072 +extern unsigned long calibrate_tsc(void);
83073 +extern unsigned long read_timer_tsc(void);
83074 +extern void init_cpu_khz(void);
83075 +extern int recalibrate_cpu_khz(void);
83076 +#ifdef CONFIG_HPET_TIMER
83077 +extern struct init_timer_opts timer_hpet_init;
83078 +extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
83079 +#endif
83080 +
83081 +#ifdef CONFIG_X86_PM_TIMER
83082 +extern struct init_timer_opts timer_pmtmr_init;
83083 +#endif
83084 +#endif
83085 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/tlbflush.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/tlbflush.h
83086 --- linux-2.6.19/include/asm-i386/mach-xen/asm/tlbflush.h       1970-01-01 00:00:00.000000000 +0000
83087 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/tlbflush.h     2007-02-02 19:10:55.000000000 +0000
83088 @@ -0,0 +1,99 @@
83089 +#ifndef _I386_TLBFLUSH_H
83090 +#define _I386_TLBFLUSH_H
83091 +
83092 +#include <linux/mm.h>
83093 +#include <asm/processor.h>
83094 +
83095 +#define __flush_tlb() xen_tlb_flush()
83096 +#define __flush_tlb_global() xen_tlb_flush()
83097 +#define __flush_tlb_all() xen_tlb_flush()
83098 +
83099 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
83100 +
83101 +#define __flush_tlb_single(addr) xen_invlpg(addr)
83102 +
83103 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
83104 +
83105 +/*
83106 + * TLB flushing:
83107 + *
83108 + *  - flush_tlb() flushes the current mm struct TLBs
83109 + *  - flush_tlb_all() flushes all processes TLBs
83110 + *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
83111 + *  - flush_tlb_page(vma, vmaddr) flushes one page
83112 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
83113 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
83114 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
83115 + *
83116 + * ..but the i386 has somewhat limited tlb flushing capabilities,
83117 + * and page-granular flushes are available only on i486 and up.
83118 + */
83119 +
83120 +#ifndef CONFIG_SMP
83121 +
83122 +#define flush_tlb() __flush_tlb()
83123 +#define flush_tlb_all() __flush_tlb_all()
83124 +#define local_flush_tlb() __flush_tlb()
83125 +
83126 +static inline void flush_tlb_mm(struct mm_struct *mm)
83127 +{
83128 +       if (mm == current->active_mm)
83129 +               __flush_tlb();
83130 +}
83131 +
83132 +static inline void flush_tlb_page(struct vm_area_struct *vma,
83133 +       unsigned long addr)
83134 +{
83135 +       if (vma->vm_mm == current->active_mm)
83136 +               __flush_tlb_one(addr);
83137 +}
83138 +
83139 +static inline void flush_tlb_range(struct vm_area_struct *vma,
83140 +       unsigned long start, unsigned long end)
83141 +{
83142 +       if (vma->vm_mm == current->active_mm)
83143 +               __flush_tlb();
83144 +}
83145 +
83146 +#else
83147 +
83148 +#include <asm/smp.h>
83149 +
83150 +#define local_flush_tlb() \
83151 +       __flush_tlb()
83152 +
83153 +extern void flush_tlb_all(void);
83154 +extern void flush_tlb_current_task(void);
83155 +extern void flush_tlb_mm(struct mm_struct *);
83156 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
83157 +
83158 +#define flush_tlb()    flush_tlb_current_task()
83159 +
83160 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
83161 +{
83162 +       flush_tlb_mm(vma->vm_mm);
83163 +}
83164 +
83165 +#define TLBSTATE_OK    1
83166 +#define TLBSTATE_LAZY  2
83167 +
83168 +struct tlb_state
83169 +{
83170 +       struct mm_struct *active_mm;
83171 +       int state;
83172 +       char __cacheline_padding[L1_CACHE_BYTES-8];
83173 +};
83174 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
83175 +
83176 +
83177 +#endif
83178 +
83179 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
83180 +
83181 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
83182 +                                     unsigned long start, unsigned long end)
83183 +{
83184 +       /* i386 does not keep any page table caches in TLB */
83185 +}
83186 +
83187 +#endif /* _I386_TLBFLUSH_H */
83188 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/vga.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/vga.h
83189 --- linux-2.6.19/include/asm-i386/mach-xen/asm/vga.h    1970-01-01 00:00:00.000000000 +0000
83190 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/vga.h  2007-02-02 19:10:55.000000000 +0000
83191 @@ -0,0 +1,20 @@
83192 +/*
83193 + *     Access to VGA videoram
83194 + *
83195 + *     (c) 1998 Martin Mares <mj@ucw.cz>
83196 + */
83197 +
83198 +#ifndef _LINUX_ASM_VGA_H_
83199 +#define _LINUX_ASM_VGA_H_
83200 +
83201 +/*
83202 + *     On the PC, we can just recalculate addresses and then
83203 + *     access the videoram directly without any black magic.
83204 + */
83205 +
83206 +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
83207 +
83208 +#define vga_readb(x) (*(x))
83209 +#define vga_writeb(x,y) (*(y) = (x))
83210 +
83211 +#endif
83212 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/asm/xenoprof.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/xenoprof.h
83213 --- linux-2.6.19/include/asm-i386/mach-xen/asm/xenoprof.h       1970-01-01 00:00:00.000000000 +0000
83214 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/asm/xenoprof.h     2007-02-02 19:10:55.000000000 +0000
83215 @@ -0,0 +1,48 @@
83216 +/******************************************************************************
83217 + * asm-i386/mach-xen/asm/xenoprof.h
83218 + *
83219 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
83220 + *                    VA Linux Systems Japan K.K.
83221 + *
83222 + * This program is free software; you can redistribute it and/or modify
83223 + * it under the terms of the GNU General Public License as published by
83224 + * the Free Software Foundation; either version 2 of the License, or
83225 + * (at your option) any later version.
83226 + *
83227 + * This program is distributed in the hope that it will be useful,
83228 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
83229 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
83230 + * GNU General Public License for more details.
83231 + *
83232 + * You should have received a copy of the GNU General Public License
83233 + * along with this program; if not, write to the Free Software
83234 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
83235 + *
83236 + */
83237 +#ifndef __ASM_XENOPROF_H__
83238 +#define __ASM_XENOPROF_H__
83239 +#ifdef CONFIG_XEN
83240 +
83241 +struct super_block;
83242 +struct dentry;
83243 +int xenoprof_create_files(struct super_block * sb, struct dentry * root);
83244 +#define HAVE_XENOPROF_CREATE_FILES
83245 +
83246 +struct xenoprof_init;
83247 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
83248 +void xenoprof_arch_counter(void);
83249 +void xenoprof_arch_start(void);
83250 +void xenoprof_arch_stop(void);
83251 +
83252 +struct xenoprof_arch_shared_buffer {
83253 +       /* nothing */
83254 +};
83255 +struct xenoprof_shared_buffer;
83256 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
83257 +struct xenoprof_get_buffer;
83258 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
83259 +struct xenoprof_passive;
83260 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
83261 +
83262 +#endif /* CONFIG_XEN */
83263 +#endif /* __ASM_XENOPROF_H__ */
83264 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/irq_vectors.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/irq_vectors.h
83265 --- linux-2.6.19/include/asm-i386/mach-xen/irq_vectors.h        1970-01-01 00:00:00.000000000 +0000
83266 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/irq_vectors.h      2007-02-02 19:10:55.000000000 +0000
83267 @@ -0,0 +1,125 @@
83268 +/*
83269 + * This file should contain #defines for all of the interrupt vector
83270 + * numbers used by this architecture.
83271 + *
83272 + * In addition, there are some standard defines:
83273 + *
83274 + *     FIRST_EXTERNAL_VECTOR:
83275 + *             The first free place for external interrupts
83276 + *
83277 + *     SYSCALL_VECTOR:
83278 + *             The IRQ vector a syscall makes the user to kernel transition
83279 + *             The IRQ vector under which a syscall makes the user-to-kernel
83280 + *             transition.
83281 + *     TIMER_IRQ:
83282 + *             The IRQ number the timer interrupt comes in at.
83283 + *
83284 + *     NR_IRQS:
83285 + *             The total number of interrupt vectors (including all the
83286 + *             architecture specific interrupts) needed.
83287 + *
83288 + */                    
83289 +#ifndef _ASM_IRQ_VECTORS_H
83290 +#define _ASM_IRQ_VECTORS_H
83291 +
83292 +/*
83293 + * IDT vectors usable for external interrupt sources start
83294 + * at 0x20:
83295 + */
83296 +#define FIRST_EXTERNAL_VECTOR  0x20
83297 +
83298 +#define SYSCALL_VECTOR         0x80
83299 +
83300 +/*
83301 + * Vectors 0x20-0x2f are used for ISA interrupts.
83302 + */
83303 +
83304 +#if 0
83305 +/*
83306 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
83307 + *
83308 + *  some of the following vectors are 'rare', they are merged
83309 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
83310 + *  TLB, reschedule and local APIC vectors are performance-critical.
83311 + *
83312 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
83313 + */
83314 +#define SPURIOUS_APIC_VECTOR   0xff
83315 +#define ERROR_APIC_VECTOR      0xfe
83316 +#define INVALIDATE_TLB_VECTOR  0xfd
83317 +#define RESCHEDULE_VECTOR      0xfc
83318 +#define CALL_FUNCTION_VECTOR   0xfb
83319 +
83320 +#define THERMAL_APIC_VECTOR    0xf0
83321 +/*
83322 + * Local APIC timer IRQ vector is on a different priority level,
83323 + * to work around the 'lost local interrupt if more than 2 IRQ
83324 + * sources per level' errata.
83325 + */
83326 +#define LOCAL_TIMER_VECTOR     0xef
83327 +#endif
83328 +
83329 +#define SPURIOUS_APIC_VECTOR   0xff
83330 +#define ERROR_APIC_VECTOR      0xfe
83331 +
83332 +/*
83333 + * First APIC vector available to drivers: (vectors 0x30-0xee)
83334 + * we start at 0x31 to spread out vectors evenly between priority
83335 + * levels. (0x80 is the syscall vector)
83336 + */
83337 +#define FIRST_DEVICE_VECTOR    0x31
83338 +#define FIRST_SYSTEM_VECTOR    0xef
83339 +
83340 +/*
83341 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
83342 + * Right now the APIC is mostly only used for SMP.
83343 + * 256 vectors is an architectural limit. (we can have
83344 + * more than 256 devices theoretically, but they will
83345 + * have to use shared interrupts)
83346 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
83347 + * the usable vector space is 0x20-0xff (224 vectors)
83348 + */
83349 +
83350 +#define RESCHEDULE_VECTOR      0
83351 +#define CALL_FUNCTION_VECTOR   1
83352 +#define NR_IPIS                        2
83353 +
83354 +/*
83355 + * The maximum number of vectors supported by i386 processors
83356 + * is limited to 256. For processors other than i386, NR_VECTORS
83357 + * should be changed accordingly.
83358 + */
83359 +#define NR_VECTORS 256
83360 +
83361 +#define FPU_IRQ                        13
83362 +
83363 +#define        FIRST_VM86_IRQ          3
83364 +#define LAST_VM86_IRQ          15
83365 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
83366 +
83367 +/*
83368 + * The flat IRQ space is divided into two regions:
83369 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
83370 + *     if we have physical device-access privilege. This region is at the 
83371 + *     start of the IRQ space so that existing device drivers do not need
83372 + *     to be modified to translate physical IRQ numbers into our IRQ space.
83373 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
83374 + *     are bound using the provided bind/unbind functions.
83375 + */
83376 +
83377 +#define PIRQ_BASE              0
83378 +#define NR_PIRQS               256
83379 +
83380 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
83381 +#define NR_DYNIRQS             256
83382 +
83383 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
83384 +#define NR_IRQ_VECTORS         NR_IRQS
83385 +
83386 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
83387 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
83388 +
83389 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
83390 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
83391 +
83392 +#endif /* _ASM_IRQ_VECTORS_H */
83393 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/mach_traps.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/mach_traps.h
83394 --- linux-2.6.19/include/asm-i386/mach-xen/mach_traps.h 1970-01-01 00:00:00.000000000 +0000
83395 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/mach_traps.h       2007-02-02 19:10:55.000000000 +0000
83396 @@ -0,0 +1,33 @@
83397 +/*
83398 + *  include/asm-xen/asm-i386/mach-xen/mach_traps.h
83399 + *
83400 + *  Machine specific NMI handling for Xen
83401 + */
83402 +#ifndef _MACH_TRAPS_H
83403 +#define _MACH_TRAPS_H
83404 +
83405 +#include <linux/bitops.h>
83406 +#include <xen/interface/nmi.h>
83407 +
83408 +static inline void clear_mem_error(unsigned char reason) {}
83409 +static inline void clear_io_check_error(unsigned char reason) {}
83410 +
83411 +static inline unsigned char get_nmi_reason(void)
83412 +{
83413 +       shared_info_t *s = HYPERVISOR_shared_info;
83414 +       unsigned char reason = 0;
83415 +
83416 +       /* construct a value which looks like it came from
83417 +        * port 0x61.
83418 +        */
83419 +       if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
83420 +               reason |= 0x40;
83421 +       if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
83422 +               reason |= 0x80;
83423 +
83424 +        return reason;
83425 +}
83426 +
83427 +static inline void reassert_nmi(void) {}
83428 +
83429 +#endif /* !_MACH_TRAPS_H */
83430 diff -ruNp linux-2.6.19/include/asm-i386/mach-xen/setup_arch.h linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/setup_arch.h
83431 --- linux-2.6.19/include/asm-i386/mach-xen/setup_arch.h 1970-01-01 00:00:00.000000000 +0000
83432 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/mach-xen/setup_arch.h       2007-02-02 19:10:55.000000000 +0000
83433 @@ -0,0 +1,5 @@
83434 +/* Hook to call BIOS initialisation function */
83435 +
83436 +#define ARCH_SETUP machine_specific_arch_setup();
83437 +
83438 +void __init machine_specific_arch_setup(void);
83439 diff -ruNp linux-2.6.19/include/asm-i386/pgtable-2level-defs.h linux-2.6.19-xen-3.0.4/include/asm-i386/pgtable-2level-defs.h
83440 --- linux-2.6.19/include/asm-i386/pgtable-2level-defs.h 2006-11-29 21:57:37.000000000 +0000
83441 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/pgtable-2level-defs.h       2007-02-02 19:10:55.000000000 +0000
83442 @@ -1,6 +1,8 @@
83443  #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
83444  #define _I386_PGTABLE_2LEVEL_DEFS_H
83445  
83446 +#define HAVE_SHARED_KERNEL_PMD 0
83447 +
83448  /*
83449   * traditional i386 two-level paging structure:
83450   */
83451 diff -ruNp linux-2.6.19/include/asm-i386/pgtable-3level-defs.h linux-2.6.19-xen-3.0.4/include/asm-i386/pgtable-3level-defs.h
83452 --- linux-2.6.19/include/asm-i386/pgtable-3level-defs.h 2006-11-29 21:57:37.000000000 +0000
83453 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/pgtable-3level-defs.h       2007-02-02 19:10:55.000000000 +0000
83454 @@ -1,6 +1,8 @@
83455  #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
83456  #define _I386_PGTABLE_3LEVEL_DEFS_H
83457  
83458 +#define HAVE_SHARED_KERNEL_PMD 1
83459 +
83460  /*
83461   * PGDIR_SHIFT determines what a top-level page table entry can map
83462   */
83463 diff -ruNp linux-2.6.19/include/asm-i386/spinlock.h linux-2.6.19-xen-3.0.4/include/asm-i386/spinlock.h
83464 --- linux-2.6.19/include/asm-i386/spinlock.h    2006-11-29 21:57:37.000000000 +0000
83465 +++ linux-2.6.19-xen-3.0.4/include/asm-i386/spinlock.h  2007-02-02 19:10:55.000000000 +0000
83466 @@ -7,8 +7,13 @@
83467  #include <asm/processor.h>
83468  #include <linux/compiler.h>
83469  
83470 +#ifdef CONFIG_XEN
83471 +#define CLI_STRING
83472 +#define STI_STRING
83473 +#else
83474  #define CLI_STRING     "cli"
83475  #define STI_STRING     "sti"
83476 +#endif
83477  
83478  /*
83479   * Your basic SMP spinlocks, allowing only a single CPU anywhere
83480 diff -ruNp linux-2.6.19/include/asm-ia64/agp.h linux-2.6.19-xen-3.0.4/include/asm-ia64/agp.h
83481 --- linux-2.6.19/include/asm-ia64/agp.h 2006-11-29 21:57:37.000000000 +0000
83482 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/agp.h       2007-02-02 19:10:55.000000000 +0000
83483 @@ -19,13 +19,44 @@
83484  #define flush_agp_cache()              mb()
83485  
83486  /* Convert a physical address to an address suitable for the GART. */
83487 +#ifndef CONFIG_XEN
83488  #define phys_to_gart(x) (x)
83489  #define gart_to_phys(x) (x)
83490 +#else
83491 +#define phys_to_gart(x) phys_to_machine_for_dma(x)
83492 +#define gart_to_phys(x) machine_to_phys_for_dma(x)
83493 +#endif
83494  
83495  /* GATT allocation. Returns/accepts GATT kernel virtual address. */
83496 +#ifndef CONFIG_XEN
83497  #define alloc_gatt_pages(order)                \
83498         ((char *)__get_free_pages(GFP_KERNEL, (order)))
83499  #define free_gatt_pages(table, order)  \
83500         free_pages((unsigned long)(table), (order))
83501 +#else
83502 +#include <asm/hypervisor.h>
83503 +static inline char*
83504 +alloc_gatt_pages(unsigned int order)
83505 +{
83506 +       unsigned long error;
83507 +       unsigned long ret = __get_free_pages(GFP_KERNEL, (order));
83508 +       if (ret == 0) {
83509 +               goto out;
83510 +       }
83511 +       error = xen_create_contiguous_region(ret, order, 0);
83512 +       if (error) {
83513 +               free_pages(ret, order);
83514 +               ret = 0;
83515 +       }
83516 +out:
83517 +       return (char*)ret;
83518 +}
83519 +static inline void
83520 +free_gatt_pages(void* table, unsigned int order)
83521 +{
83522 +       xen_destroy_contiguous_region((unsigned long)table, order);
83523 +       free_pages((unsigned long)table, order);
83524 +}
83525 +#endif /* CONFIG_XEN */
83526  
83527  #endif /* _ASM_IA64_AGP_H */
83528 diff -ruNp linux-2.6.19/include/asm-ia64/dma-mapping.h linux-2.6.19-xen-3.0.4/include/asm-ia64/dma-mapping.h
83529 --- linux-2.6.19/include/asm-ia64/dma-mapping.h 2006-11-29 21:57:37.000000000 +0000
83530 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/dma-mapping.h       2007-02-02 19:10:55.000000000 +0000
83531 @@ -6,20 +6,67 @@
83532   *     David Mosberger-Tang <davidm@hpl.hp.com>
83533   */
83534  #include <asm/machvec.h>
83535 +#ifndef CONFIG_XEN
83536  
83537 -#define dma_alloc_coherent     platform_dma_alloc_coherent
83538 -#define dma_alloc_noncoherent  platform_dma_alloc_coherent     /* coherent mem. is cheap */
83539 -#define dma_free_coherent      platform_dma_free_coherent
83540 -#define dma_free_noncoherent   platform_dma_free_coherent
83541 -#define dma_map_single         platform_dma_map_single
83542 -#define dma_map_sg             platform_dma_map_sg
83543 -#define dma_unmap_single       platform_dma_unmap_single
83544 -#define dma_unmap_sg           platform_dma_unmap_sg
83545 -#define dma_sync_single_for_cpu        platform_dma_sync_single_for_cpu
83546 -#define dma_sync_sg_for_cpu    platform_dma_sync_sg_for_cpu
83547 +#define dma_alloc_coherent      platform_dma_alloc_coherent
83548 +#define dma_alloc_noncoherent   platform_dma_alloc_coherent     /* coherent mem. is cheap */
83549 +#define dma_free_coherent       platform_dma_free_coherent
83550 +#define dma_free_noncoherent    platform_dma_free_coherent
83551 +#define dma_map_single          platform_dma_map_single
83552 +#define dma_map_sg              platform_dma_map_sg
83553 +#define dma_unmap_single        platform_dma_unmap_single
83554 +#define dma_unmap_sg            platform_dma_unmap_sg
83555 +#define dma_sync_single_for_cpu platform_dma_sync_single_for_cpu
83556 +#define dma_sync_sg_for_cpu     platform_dma_sync_sg_for_cpu
83557  #define dma_sync_single_for_device platform_dma_sync_single_for_device
83558 -#define dma_sync_sg_for_device platform_dma_sync_sg_for_device
83559 -#define dma_mapping_error      platform_dma_mapping_error
83560 +#define dma_sync_sg_for_device  platform_dma_sync_sg_for_device
83561 +#define dma_mapping_error       platform_dma_mapping_error
83562 +
83563 +#else /* CONFIG_XEN */
83564 +/* Needed for arch/i386/kernel/swiotlb.c and arch/i386/kernel/pci-dma-xen.c */
83565 +#include <asm/hypervisor.h>
83566 +/* Needed for arch/i386/kernel/swiotlb.c */
83567 +#include <asm-i386/mach-xen/asm/swiotlb.h>
83568 +
83569 +int dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
83570 +               enum dma_data_direction direction);
83571 +void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
83572 +                  enum dma_data_direction direction);
83573 +int dma_supported(struct device *dev, u64 mask);
83574 +void *dma_alloc_coherent(struct device *dev, size_t size,
83575 +                         dma_addr_t *dma_handle, gfp_t gfp);
83576 +void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
83577 +                       dma_addr_t dma_handle);
83578 +dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
83579 +                          enum dma_data_direction direction);
83580 +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
83581 +                      enum dma_data_direction direction);
83582 +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
83583 +                             size_t size, enum dma_data_direction direction);
83584 +void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
83585 +                                size_t size,
83586 +                                enum dma_data_direction direction);
83587 +int dma_mapping_error(dma_addr_t dma_addr);
83588 +
83589 +#define flush_write_buffers()  do { } while (0)
83590 +static inline void
83591 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
83592 +                    enum dma_data_direction direction)
83593 +{
83594 +       if (swiotlb)
83595 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
83596 +       flush_write_buffers();
83597 +}
83598 +
83599 +static inline void
83600 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
83601 +                       enum dma_data_direction direction)
83602 +{
83603 +       if (swiotlb)
83604 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
83605 +       flush_write_buffers();
83606 +}
83607 +#endif /* CONFIG_XEN */
83608  
83609  #define dma_map_page(dev, pg, off, size, dir)                          \
83610         dma_map_single(dev, page_address(pg) + (off), (size), (dir))
83611 @@ -36,7 +83,9 @@
83612  #define dma_sync_single_range_for_device(dev, dma_handle, offset, size, dir)   \
83613         dma_sync_single_for_device(dev, dma_handle, size, dir)
83614  
83615 +#ifndef CONFIG_XEN
83616  #define dma_supported          platform_dma_supported
83617 +#endif
83618  
83619  static inline int
83620  dma_set_mask (struct device *dev, u64 mask)
83621 @@ -61,4 +110,29 @@ dma_cache_sync (void *vaddr, size_t size
83622  
83623  #define dma_is_consistent(dma_handle)  (1)     /* all we do is coherent memory... */
83624  
83625 +#ifdef CONFIG_XEN
83626 +/* arch/i386/kernel/swiotlb.o requires */
83627 +void contiguous_bitmap_init(unsigned long end_pfn);
83628 +
83629 +static inline int
83630 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
83631 +{
83632 +       dma_addr_t mask = DMA_64BIT_MASK;
83633 +       /* If the device has a mask, use it, otherwise default to 64 bits */
83634 +       if (hwdev && hwdev->dma_mask)
83635 +               mask = *hwdev->dma_mask;
83636 +       return (addr & ~mask) != 0;
83637 +}
83638 +#else
83639 +#define contiguous_bitmap_init(end_pfn)        ((void)end_pfn)
83640 +#endif
83641 +
83642 +static inline int
83643 +range_straddles_page_boundary(void *p, size_t size)
83644 +{
83645 +       extern unsigned long *contiguous_bitmap;
83646 +       return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
83647 +               !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
83648 +}
83649 +
83650  #endif /* _ASM_IA64_DMA_MAPPING_H */
83651 diff -ruNp linux-2.6.19/include/asm-ia64/fixmap.h linux-2.6.19-xen-3.0.4/include/asm-ia64/fixmap.h
83652 --- linux-2.6.19/include/asm-ia64/fixmap.h      1970-01-01 00:00:00.000000000 +0000
83653 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/fixmap.h    2007-02-02 19:10:55.000000000 +0000
83654 @@ -0,0 +1,2 @@
83655 +#define clear_fixmap(x)        do {} while (0)
83656 +#define        set_fixmap(x,y) do {} while (0)
83657 diff -ruNp linux-2.6.19/include/asm-ia64/gcc_intrin.h linux-2.6.19-xen-3.0.4/include/asm-ia64/gcc_intrin.h
83658 --- linux-2.6.19/include/asm-ia64/gcc_intrin.h  2006-11-29 21:57:37.000000000 +0000
83659 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/gcc_intrin.h        2007-02-02 19:10:55.000000000 +0000
83660 @@ -26,7 +26,7 @@ extern void ia64_bad_param_for_getreg (v
83661  
83662  register unsigned long ia64_r13 asm ("r13") __attribute_used__;
83663  
83664 -#define ia64_setreg(regnum, val)                                               \
83665 +#define __ia64_setreg(regnum, val)                                             \
83666  ({                                                                             \
83667         switch (regnum) {                                                       \
83668             case _IA64_REG_PSR_L:                                               \
83669 @@ -55,7 +55,7 @@ register unsigned long ia64_r13 asm ("r1
83670         }                                                                       \
83671  })
83672  
83673 -#define ia64_getreg(regnum)                                                    \
83674 +#define __ia64_getreg(regnum)                                                  \
83675  ({                                                                             \
83676         __u64 ia64_intri_res;                                                   \
83677                                                                                 \
83678 @@ -92,7 +92,7 @@ register unsigned long ia64_r13 asm ("r1
83679  
83680  #define ia64_hint_pause 0
83681  
83682 -#define ia64_hint(mode)                                                \
83683 +#define __ia64_hint(mode)                                              \
83684  ({                                                             \
83685         switch (mode) {                                         \
83686         case ia64_hint_pause:                                   \
83687 @@ -374,7 +374,7 @@ register unsigned long ia64_r13 asm ("r1
83688  
83689  #define ia64_invala() asm volatile ("invala" ::: "memory")
83690  
83691 -#define ia64_thash(addr)                                                       \
83692 +#define __ia64_thash(addr)                                                     \
83693  ({                                                                             \
83694         __u64 ia64_intri_res;                                                   \
83695         asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr));       \
83696 @@ -394,18 +394,18 @@ register unsigned long ia64_r13 asm ("r1
83697  
83698  #define ia64_nop(x)    asm volatile ("nop %0"::"i"(x));
83699  
83700 -#define ia64_itci(addr)        asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
83701 +#define __ia64_itci(addr)      asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
83702  
83703 -#define ia64_itcd(addr)        asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
83704 +#define __ia64_itcd(addr)      asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
83705  
83706  
83707 -#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                                \
83708 +#define __ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                      \
83709                                              :: "r"(trnum), "r"(addr) : "memory")
83710  
83711 -#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                                \
83712 +#define __ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                      \
83713                                              :: "r"(trnum), "r"(addr) : "memory")
83714  
83715 -#define ia64_tpa(addr)                                                         \
83716 +#define __ia64_tpa(addr)                                                       \
83717  ({                                                                             \
83718         __u64 ia64_pa;                                                          \
83719         asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory");    \
83720 @@ -415,22 +415,22 @@ register unsigned long ia64_r13 asm ("r1
83721  #define __ia64_set_dbr(index, val)                                             \
83722         asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
83723  
83724 -#define ia64_set_ibr(index, val)                                               \
83725 +#define __ia64_set_ibr(index, val)                                             \
83726         asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
83727  
83728 -#define ia64_set_pkr(index, val)                                               \
83729 +#define __ia64_set_pkr(index, val)                                             \
83730         asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
83731  
83732 -#define ia64_set_pmc(index, val)                                               \
83733 +#define __ia64_set_pmc(index, val)                                             \
83734         asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
83735  
83736 -#define ia64_set_pmd(index, val)                                               \
83737 +#define __ia64_set_pmd(index, val)                                             \
83738         asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
83739  
83740 -#define ia64_set_rr(index, val)                                                        \
83741 +#define __ia64_set_rr(index, val)                                                      \
83742         asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
83743  
83744 -#define ia64_get_cpuid(index)                                                          \
83745 +#define __ia64_get_cpuid(index)                                                                \
83746  ({                                                                                     \
83747         __u64 ia64_intri_res;                                                           \
83748         asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index));        \
83749 @@ -444,21 +444,21 @@ register unsigned long ia64_r13 asm ("r1
83750         ia64_intri_res;                                                         \
83751  })
83752  
83753 -#define ia64_get_ibr(index)                                                    \
83754 +#define __ia64_get_ibr(index)                                                  \
83755  ({                                                                             \
83756         __u64 ia64_intri_res;                                                   \
83757         asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
83758         ia64_intri_res;                                                         \
83759  })
83760  
83761 -#define ia64_get_pkr(index)                                                    \
83762 +#define __ia64_get_pkr(index)                                                  \
83763  ({                                                                             \
83764         __u64 ia64_intri_res;                                                   \
83765         asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
83766         ia64_intri_res;                                                         \
83767  })
83768  
83769 -#define ia64_get_pmc(index)                                                    \
83770 +#define __ia64_get_pmc(index)                                                  \
83771  ({                                                                             \
83772         __u64 ia64_intri_res;                                                   \
83773         asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
83774 @@ -466,48 +466,48 @@ register unsigned long ia64_r13 asm ("r1
83775  })
83776  
83777  
83778 -#define ia64_get_pmd(index)                                                    \
83779 +#define __ia64_get_pmd(index)                                                  \
83780  ({                                                                             \
83781         __u64 ia64_intri_res;                                                   \
83782         asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
83783         ia64_intri_res;                                                         \
83784  })
83785  
83786 -#define ia64_get_rr(index)                                                     \
83787 +#define __ia64_get_rr(index)                                                   \
83788  ({                                                                             \
83789         __u64 ia64_intri_res;                                                   \
83790         asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index));    \
83791         ia64_intri_res;                                                         \
83792  })
83793  
83794 -#define ia64_fc(addr)  asm volatile ("fc %0" :: "r"(addr) : "memory")
83795 +#define __ia64_fc(addr)        asm volatile ("fc %0" :: "r"(addr) : "memory")
83796  
83797  
83798  #define ia64_sync_i()  asm volatile (";; sync.i" ::: "memory")
83799  
83800 -#define ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
83801 -#define ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
83802 +#define __ia64_ssm(mask)       asm volatile ("ssm %0":: "i"((mask)) : "memory")
83803 +#define __ia64_rsm(mask)       asm volatile ("rsm %0":: "i"((mask)) : "memory")
83804  #define ia64_sum(mask) asm volatile ("sum %0":: "i"((mask)) : "memory")
83805  #define ia64_rum(mask) asm volatile ("rum %0":: "i"((mask)) : "memory")
83806  
83807 -#define ia64_ptce(addr)        asm volatile ("ptc.e %0" :: "r"(addr))
83808 +#define __ia64_ptce(addr)      asm volatile ("ptc.e %0" :: "r"(addr))
83809  
83810 -#define ia64_ptcga(addr, size)                                                 \
83811 +#define __ia64_ptcga(addr, size)                                                       \
83812  do {                                                                           \
83813         asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory");       \
83814         ia64_dv_serialize_data();                                               \
83815  } while (0)
83816  
83817 -#define ia64_ptcl(addr, size)                                                  \
83818 +#define __ia64_ptcl(addr, size)                                                        \
83819  do {                                                                           \
83820         asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory");        \
83821         ia64_dv_serialize_data();                                               \
83822  } while (0)
83823  
83824 -#define ia64_ptri(addr, size)                                          \
83825 +#define __ia64_ptri(addr, size)                                                \
83826         asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
83827  
83828 -#define ia64_ptrd(addr, size)                                          \
83829 +#define __ia64_ptrd(addr, size)                                                \
83830         asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
83831  
83832  /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
83833 @@ -589,7 +589,7 @@ do {                                                                                \
83834          }                                                              \
83835  })
83836  
83837 -#define ia64_intrin_local_irq_restore(x)                       \
83838 +#define __ia64_intrin_local_irq_restore(x)                     \
83839  do {                                                           \
83840         asm volatile (";;   cmp.ne p6,p7=%0,r0;;"               \
83841                       "(p6) ssm psr.i;"                         \
83842 @@ -598,4 +598,6 @@ do {                                                                \
83843                       :: "r"((x)) : "p6", "p7", "memory");      \
83844  } while (0)
83845  
83846 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
83847 +
83848  #endif /* _ASM_IA64_GCC_INTRIN_H */
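The 0x4000UL mask in the new __ia64_get_psr_i() above is psr.i, the interrupt-enable bit (bit 14 of the PSR). A minimal standalone sketch of that bit test, using a fake PSR value instead of the real register, so it runs anywhere:

/* Sketch only; not part of the patch. The mask value is copied from the hunk
 * above, the "register read" is a plain variable. */
#include <assert.h>
#include <stdio.h>

#define IA64_PSR_I_BIT 14
#define IA64_PSR_I (1UL << IA64_PSR_I_BIT)            /* == 0x4000UL */

static unsigned long fake_psr;                        /* stand-in for __ia64_getreg(_IA64_REG_PSR) */
#define __ia64_getreg_psr() (fake_psr)
#define __ia64_get_psr_i()  (__ia64_getreg_psr() & 0x4000UL)

int main(void)
{
        fake_psr = IA64_PSR_I;                        /* interrupts enabled */
        assert(__ia64_get_psr_i() != 0);
        fake_psr = 0;                                 /* interrupts disabled */
        assert(__ia64_get_psr_i() == 0);
        printf("psr.i mask 0x%lx == bit %d\n", IA64_PSR_I, IA64_PSR_I_BIT);
        return 0;
}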
83849 diff -ruNp linux-2.6.19/include/asm-ia64/hw_irq.h linux-2.6.19-xen-3.0.4/include/asm-ia64/hw_irq.h
83850 --- linux-2.6.19/include/asm-ia64/hw_irq.h      2006-11-29 21:57:37.000000000 +0000
83851 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/hw_irq.h    2007-02-02 19:10:55.000000000 +0000
83852 @@ -15,7 +15,11 @@
83853  #include <asm/ptrace.h>
83854  #include <asm/smp.h>
83855  
83856 +#ifndef CONFIG_XEN
83857  typedef u8 ia64_vector;
83858 +#else
83859 +typedef u16 ia64_vector;
83860 +#endif
83861  
83862  /*
83863   * 0 special
83864 @@ -99,6 +103,12 @@ extern void register_percpu_irq (ia64_ve
83865  
83866  static inline void ia64_resend_irq(unsigned int vector)
83867  {
83868 +#ifdef CONFIG_XEN
83869 +       extern void resend_irq_on_evtchn(unsigned int i);
83870 +       if (is_running_on_xen())
83871 +               resend_irq_on_evtchn(vector);
83872 +       else
83873 +#endif /* CONFIG_XEN */
83874         platform_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
83875  }
83876  
83877 diff -ruNp linux-2.6.19/include/asm-ia64/hypercall.h linux-2.6.19-xen-3.0.4/include/asm-ia64/hypercall.h
83878 --- linux-2.6.19/include/asm-ia64/hypercall.h   1970-01-01 00:00:00.000000000 +0000
83879 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/hypercall.h 2007-02-02 19:10:55.000000000 +0000
83880 @@ -0,0 +1,463 @@
83881 +/******************************************************************************
83882 + * hypercall.h
83883 + * 
83884 + * Linux-specific hypervisor handling.
83885 + * 
83886 + * Copyright (c) 2002-2004, K A Fraser
83887 + * 
83888 + * This program is free software; you can redistribute it and/or
83889 + * modify it under the terms of the GNU General Public License version 2
83890 + * as published by the Free Software Foundation; or, when distributed
83891 + * separately from the Linux kernel or incorporated into other
83892 + * software packages, subject to the following license:
83893 + * 
83894 + * Permission is hereby granted, free of charge, to any person obtaining a copy
83895 + * of this source file (the "Software"), to deal in the Software without
83896 + * restriction, including without limitation the rights to use, copy, modify,
83897 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
83898 + * and to permit persons to whom the Software is furnished to do so, subject to
83899 + * the following conditions:
83900 + * 
83901 + * The above copyright notice and this permission notice shall be included in
83902 + * all copies or substantial portions of the Software.
83903 + * 
83904 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
83905 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
83906 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
83907 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
83908 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
83909 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
83910 + * IN THE SOFTWARE.
83911 + */
83912 +
83913 +#ifndef __HYPERCALL_H__
83914 +#define __HYPERCALL_H__
83915 +
83916 +#ifndef __HYPERVISOR_H__
83917 +# error "please don't include this file directly"
83918 +#endif
83919 +
83920 +#include <asm/xen/xcom_hcall.h>
83921 +struct xencomm_handle;
83922 +
83923 +/*
83924 + * Assembler stubs for hyper-calls.
83925 + */
83926 +
83927 +#define _hypercall0(type, name)                                        \
83928 +({                                                             \
83929 +       long __res;                                             \
83930 +       __asm__ __volatile__ (";;\n"                            \
83931 +                             "mov r2=%1\n"                     \
83932 +                             "break 0x1000 ;;\n"               \
83933 +                             "mov %0=r8 ;;\n"                  \
83934 +                             : "=r" (__res)                    \
83935 +                             : "J" (__HYPERVISOR_##name)       \
83936 +                             : "r2","r8",                      \
83937 +                               "memory" );                     \
83938 +       (type)__res;                                            \
83939 +})
83940 +
83941 +#define _hypercall1(type, name, a1)                            \
83942 +({                                                             \
83943 +       long __res;                                             \
83944 +       __asm__ __volatile__ (";;\n"                            \
83945 +                             "mov r14=%2\n"                    \
83946 +                             "mov r2=%1\n"                     \
83947 +                             "break 0x1000 ;;\n"               \
83948 +                             "mov %0=r8 ;;\n"                  \
83949 +                             : "=r" (__res)                    \
83950 +                             : "J" (__HYPERVISOR_##name),      \
83951 +                               "rI" ((unsigned long)(a1))      \
83952 +                             : "r14","r2","r8",                \
83953 +                               "memory" );                     \
83954 +       (type)__res;                                            \
83955 +})
83956 +
83957 +#define _hypercall2(type, name, a1, a2)                                \
83958 +({                                                             \
83959 +       long __res;                                             \
83960 +       __asm__ __volatile__ (";;\n"                            \
83961 +                             "mov r14=%2\n"                    \
83962 +                             "mov r15=%3\n"                    \
83963 +                             "mov r2=%1\n"                     \
83964 +                             "break 0x1000 ;;\n"               \
83965 +                             "mov %0=r8 ;;\n"                  \
83966 +                             : "=r" (__res)                    \
83967 +                             : "J" (__HYPERVISOR_##name),      \
83968 +                               "rI" ((unsigned long)(a1)),     \
83969 +                               "rI" ((unsigned long)(a2))      \
83970 +                             : "r14","r15","r2","r8",          \
83971 +                               "memory" );                     \
83972 +       (type)__res;                                            \
83973 +})
83974 +
83975 +#define _hypercall3(type, name, a1, a2, a3)                    \
83976 +({                                                             \
83977 +       long __res;                                             \
83978 +       __asm__ __volatile__ (";;\n"                            \
83979 +                             "mov r14=%2\n"                    \
83980 +                             "mov r15=%3\n"                    \
83981 +                             "mov r16=%4\n"                    \
83982 +                             "mov r2=%1\n"                     \
83983 +                             "break 0x1000 ;;\n"               \
83984 +                             "mov %0=r8 ;;\n"                  \
83985 +                             : "=r" (__res)                    \
83986 +                             : "J" (__HYPERVISOR_##name),      \
83987 +                               "rI" ((unsigned long)(a1)),     \
83988 +                               "rI" ((unsigned long)(a2)),     \
83989 +                               "rI" ((unsigned long)(a3))      \
83990 +                             : "r14","r15","r16","r2","r8",    \
83991 +                               "memory" );                     \
83992 +       (type)__res;                                            \
83993 +})
83994 +
83995 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
83996 +({                                                             \
83997 +       long __res;                                             \
83998 +       __asm__ __volatile__ (";;\n"                            \
83999 +                             "mov r14=%2\n"                    \
84000 +                             "mov r15=%3\n"                    \
84001 +                             "mov r16=%4\n"                    \
84002 +                             "mov r17=%5\n"                    \
84003 +                             "mov r2=%1\n"                     \
84004 +                             "break 0x1000 ;;\n"               \
84005 +                             "mov %0=r8 ;;\n"                  \
84006 +                             : "=r" (__res)                    \
84007 +                             : "J" (__HYPERVISOR_##name),      \
84008 +                               "rI" ((unsigned long)(a1)),     \
84009 +                               "rI" ((unsigned long)(a2)),     \
84010 +                               "rI" ((unsigned long)(a3)),     \
84011 +                               "rI" ((unsigned long)(a4))      \
84012 +                             : "r14","r15","r16","r2","r8",    \
84013 +                               "r17","memory" );               \
84014 +       (type)__res;                                            \
84015 +})
84016 +
84017 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
84018 +({                                                             \
84019 +       long __res;                                             \
84020 +       __asm__ __volatile__ (";;\n"                            \
84021 +                             "mov r14=%2\n"                    \
84022 +                             "mov r15=%3\n"                    \
84023 +                             "mov r16=%4\n"                    \
84024 +                             "mov r17=%5\n"                    \
84025 +                             "mov r18=%6\n"                    \
84026 +                             "mov r2=%1\n"                     \
84027 +                             "break 0x1000 ;;\n"               \
84028 +                             "mov %0=r8 ;;\n"                  \
84029 +                             : "=r" (__res)                    \
84030 +                             : "J" (__HYPERVISOR_##name),      \
84031 +                               "rI" ((unsigned long)(a1)),     \
84032 +                               "rI" ((unsigned long)(a2)),     \
84033 +                               "rI" ((unsigned long)(a3)),     \
84034 +                               "rI" ((unsigned long)(a4)),     \
84035 +                               "rI" ((unsigned long)(a5))      \
84036 +                             : "r14","r15","r16","r2","r8",    \
84037 +                               "r17","r18","memory" );         \
84038 +       (type)__res;                                            \
84039 +})
84040 +
84041 +
84042 +static inline int
84043 +xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
84044 +{
84045 +       return _hypercall2(int, sched_op, cmd, arg);
84046 +}
84047 +
84048 +static inline long
84049 +HYPERVISOR_set_timer_op(u64 timeout)
84050 +{
84051 +       unsigned long timeout_hi = (unsigned long)(timeout >> 32);
84052 +       unsigned long timeout_lo = (unsigned long)timeout;
84053 +       return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
84054 +}
84055 +
84056 +static inline int
84057 +xencomm_arch_hypercall_dom0_op(struct xencomm_handle *op)
84058 +{
84059 +       return _hypercall1(int, dom0_op, op);
84060 +}
84061 +
84062 +static inline int
84063 +xencomm_arch_hypercall_sysctl(struct xencomm_handle *op)
84064 +{
84065 +       return _hypercall1(int, sysctl, op);
84066 +}
84067 +
84068 +static inline int
84069 +xencomm_arch_hypercall_domctl(struct xencomm_handle *op)
84070 +{
84071 +       return _hypercall1(int, domctl, op);
84072 +}
84073 +
84074 +static inline int
84075 +xencomm_arch_hypercall_multicall(struct xencomm_handle *call_list,
84076 +                                int nr_calls)
84077 +{
84078 +       return _hypercall2(int, multicall, call_list, nr_calls);
84079 +}
84080 +
84081 +static inline int
84082 +xencomm_arch_hypercall_memory_op(unsigned int cmd, struct xencomm_handle *arg)
84083 +{
84084 +       return _hypercall2(int, memory_op, cmd, arg);
84085 +}
84086 +
84087 +static inline int
84088 +xencomm_arch_hypercall_event_channel_op(int cmd, struct xencomm_handle *arg)
84089 +{
84090 +       return _hypercall2(int, event_channel_op, cmd, arg);
84091 +}
84092 +
84093 +static inline int
84094 +xencomm_arch_hypercall_acm_op(unsigned int cmd, struct xencomm_handle *arg)
84095 +{
84096 +       return _hypercall2(int, acm_op, cmd, arg);
84097 +}
84098 +
84099 +static inline int
84100 +xencomm_arch_hypercall_xen_version(int cmd, struct xencomm_handle *arg)
84101 +{
84102 +       return _hypercall2(int, xen_version, cmd, arg);
84103 +}
84104 +
84105 +static inline int
84106 +xencomm_arch_hypercall_console_io(int cmd, int count,
84107 +                                  struct xencomm_handle *str)
84108 +{
84109 +       return _hypercall3(int, console_io, cmd, count, str);
84110 +}
84111 +
84112 +static inline int
84113 +xencomm_arch_hypercall_physdev_op(int cmd, struct xencomm_handle *arg)
84114 +{
84115 +       return _hypercall2(int, physdev_op, cmd, arg);
84116 +}
84117 +
84118 +static inline int
84119 +xencomm_arch_hypercall_grant_table_op(unsigned int cmd,
84120 +                                      struct xencomm_handle *uop,
84121 +                                      unsigned int count)
84122 +{
84123 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
84124 +}
84125 +
84126 +int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count);
84127 +
84128 +extern int xencomm_arch_hypercall_suspend(struct xencomm_handle *arg);
84129 +
84130 +static inline int
84131 +xencomm_arch_hypercall_callback_op(int cmd, struct xencomm_handle *arg)
84132 +{
84133 +       return _hypercall2(int, callback_op, cmd, arg);
84134 +}
84135 +
84136 +static inline unsigned long
84137 +xencomm_arch_hypercall_hvm_op(int cmd, void *arg)
84138 +{
84139 +       return _hypercall2(unsigned long, hvm_op, cmd, arg);
84140 +}
84141 +
84142 +static inline int
84143 +HYPERVISOR_physdev_op(int cmd, void *arg)
84144 +{
84145 +       switch (cmd) {
84146 +       case PHYSDEVOP_eoi:
84147 +               return _hypercall1(int, ia64_fast_eoi,
84148 +                                  ((struct physdev_eoi *)arg)->irq);
84149 +       default:
84150 +               return xencomm_hypercall_physdev_op(cmd, arg);
84151 +       }
84152 +}
84153 +
84154 +static inline int
84155 +xencomm_arch_hypercall_xenoprof_op(int op, struct xencomm_handle *arg)
84156 +{
84157 +       return _hypercall2(int, xenoprof_op, op, arg);
84158 +}
84159 +
84160 +extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
84161 +static inline void exit_idle(void) {}
84162 +#define do_IRQ(irq, regs) ({                   \
84163 +       irq_enter();                            \
84164 +       __do_IRQ((irq), (regs));                \
84165 +       irq_exit();                             \
84166 +})
84167 +
84168 +#include <linux/err.h>
84169 +#ifdef CONFIG_XEN
84170 +#include <asm/xen/privop.h>
84171 +#endif /* CONFIG_XEN */
84172 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
84173 +#include <xen/platform-compat.h>
84174 +#endif
84175 +
84176 +static inline unsigned long
84177 +__HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
84178 +{
84179 +       return _hypercall3(unsigned long, ia64_dom0vp_op,
84180 +                          IA64_DOM0VP_ioremap, ioaddr, size);
84181 +}
84182 +
84183 +static inline unsigned long
84184 +HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
84185 +{
84186 +       unsigned long ret = ioaddr;
84187 +       if (is_running_on_xen()) {
84188 +               ret = __HYPERVISOR_ioremap(ioaddr, size);
84189 +               if (unlikely(ret == -ENOSYS))
84190 +                       panic("hypercall %s failed with %ld. "
84191 +                             "Please check Xen and Linux config mismatch\n",
84192 +                             __func__, -ret);
84193 +               else if (unlikely(IS_ERR_VALUE(ret)))
84194 +                       ret = ioaddr;
84195 +       }
84196 +       return ret;
84197 +}
84198 +
84199 +static inline unsigned long
84200 +__HYPERVISOR_phystomach(unsigned long gpfn)
84201 +{
84202 +       return _hypercall2(unsigned long, ia64_dom0vp_op,
84203 +                          IA64_DOM0VP_phystomach, gpfn);
84204 +}
84205 +
84206 +static inline unsigned long
84207 +HYPERVISOR_phystomach(unsigned long gpfn)
84208 +{
84209 +       unsigned long ret = gpfn;
84210 +       if (is_running_on_xen()) {
84211 +               ret = __HYPERVISOR_phystomach(gpfn);
84212 +       }
84213 +       return ret;
84214 +}
84215 +
84216 +static inline unsigned long
84217 +__HYPERVISOR_machtophys(unsigned long mfn)
84218 +{
84219 +       return _hypercall2(unsigned long, ia64_dom0vp_op,
84220 +                          IA64_DOM0VP_machtophys, mfn);
84221 +}
84222 +
84223 +static inline unsigned long
84224 +HYPERVISOR_machtophys(unsigned long mfn)
84225 +{
84226 +       unsigned long ret = mfn;
84227 +       if (is_running_on_xen()) {
84228 +               ret = __HYPERVISOR_machtophys(mfn);
84229 +       }
84230 +       return ret;
84231 +}
84232 +
84233 +static inline unsigned long
84234 +__HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
84235 +{
84236 +       return _hypercall3(unsigned long, ia64_dom0vp_op,
84237 +                          IA64_DOM0VP_zap_physmap, gpfn, extent_order);
84238 +}
84239 +
84240 +static inline unsigned long
84241 +HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
84242 +{
84243 +       unsigned long ret = 0;
84244 +       if (is_running_on_xen()) {
84245 +               ret = __HYPERVISOR_zap_physmap(gpfn, extent_order);
84246 +       }
84247 +       return ret;
84248 +}
84249 +
84250 +static inline unsigned long
84251 +__HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
84252 +                        unsigned long flags, domid_t domid)
84253 +{
84254 +       return _hypercall5(unsigned long, ia64_dom0vp_op,
84255 +                          IA64_DOM0VP_add_physmap, gpfn, mfn, flags, domid);
84256 +}
84257 +
84258 +static inline unsigned long
84259 +HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
84260 +                      unsigned long flags, domid_t domid)
84261 +{
84262 +       unsigned long ret = 0;
84263 +       BUG_ON(!is_running_on_xen());//XXX
84264 +       if (is_running_on_xen()) {
84265 +               ret = __HYPERVISOR_add_physmap(gpfn, mfn, flags, domid);
84266 +       }
84267 +       return ret;
84268 +}
84269 +
84270 +static inline unsigned long
84271 +__HYPERVISOR_add_physmap_with_gmfn(unsigned long gpfn, unsigned long gmfn,
84272 +                                   unsigned long flags, domid_t domid)
84273 +{
84274 +       return _hypercall5(unsigned long, ia64_dom0vp_op,
84275 +                          IA64_DOM0VP_add_physmap_with_gmfn,
84276 +                          gpfn, gmfn, flags, domid);
84277 +}
84278 +
84279 +static inline unsigned long
84280 +HYPERVISOR_add_physmap_with_gmfn(unsigned long gpfn, unsigned long gmfn,
84281 +                                unsigned long flags, domid_t domid)
84282 +{
84283 +       unsigned long ret = 0;
84284 +       BUG_ON(!is_running_on_xen());//XXX
84285 +       if (is_running_on_xen()) {
84286 +               ret = __HYPERVISOR_add_physmap_with_gmfn(gpfn, gmfn,
84287 +                                                        flags, domid);
84288 +       }
84289 +       return ret;
84290 +}
84291 +
84292 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
84293 +static inline unsigned long
84294 +HYPERVISOR_expose_p2m(unsigned long conv_start_gpfn,
84295 +                      unsigned long assign_start_gpfn,
84296 +                      unsigned long expose_size, unsigned long granule_pfn)
84297 +{
84298 +       return _hypercall5(unsigned long, ia64_dom0vp_op,
84299 +                          IA64_DOM0VP_expose_p2m, conv_start_gpfn,
84300 +                          assign_start_gpfn, expose_size, granule_pfn);
84301 +}
84302 +#endif
84303 +
84304 +static inline int
84305 +xencomm_arch_hypercall_perfmon_op(unsigned long cmd,
84306 +                                  struct xencomm_handle *arg,
84307 +                                  unsigned long count)
84308 +{
84309 +       return _hypercall4(int, ia64_dom0vp_op,
84310 +                          IA64_DOM0VP_perfmon, cmd, arg, count);
84311 +}
84312 +
84313 +// for balloon driver
84314 +#define HYPERVISOR_update_va_mapping(va, new_val, flags) (0)
84315 +
84316 +/* Use xencomm to do hypercalls.  */
84317 +#ifdef MODULE
84318 +#define HYPERVISOR_sched_op xencomm_mini_hypercall_sched_op
84319 +#define HYPERVISOR_event_channel_op xencomm_mini_hypercall_event_channel_op
84320 +#define HYPERVISOR_callback_op xencomm_mini_hypercall_callback_op
84321 +#define HYPERVISOR_multicall xencomm_mini_hypercall_multicall
84322 +#define HYPERVISOR_xen_version xencomm_mini_hypercall_xen_version
84323 +#define HYPERVISOR_console_io xencomm_mini_hypercall_console_io
84324 +#define HYPERVISOR_hvm_op xencomm_mini_hypercall_hvm_op
84325 +#define HYPERVISOR_memory_op xencomm_mini_hypercall_memory_op
84326 +#define HYPERVISOR_xenoprof_op xencomm_mini_hypercall_xenoprof_op
84327 +#define HYPERVISOR_perfmon_op xencomm_mini_hypercall_perfmon_op
84328 +#else
84329 +#define HYPERVISOR_sched_op xencomm_hypercall_sched_op
84330 +#define HYPERVISOR_event_channel_op xencomm_hypercall_event_channel_op
84331 +#define HYPERVISOR_callback_op xencomm_hypercall_callback_op
84332 +#define HYPERVISOR_multicall xencomm_hypercall_multicall
84333 +#define HYPERVISOR_xen_version xencomm_hypercall_xen_version
84334 +#define HYPERVISOR_console_io xencomm_hypercall_console_io
84335 +#define HYPERVISOR_hvm_op xencomm_hypercall_hvm_op
84336 +#define HYPERVISOR_memory_op xencomm_hypercall_memory_op
84337 +#define HYPERVISOR_xenoprof_op xencomm_hypercall_xenoprof_op
84338 +#define HYPERVISOR_perfmon_op xencomm_hypercall_perfmon_op
84339 +#endif
84340 +
84341 +#define HYPERVISOR_suspend xencomm_hypercall_suspend
84342 +
84343 +#endif /* __HYPERCALL_H__ */
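HYPERVISOR_ioremap() above panics on -ENOSYS and falls back to the untranslated address on any other errno. A standalone sketch of that error-handling pattern, assuming the usual IS_ERR_VALUE()/MAX_ERRNO definitions from include/linux/err.h and a stub in place of the ia64_dom0vp_op hypercall:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO       4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

/* stub standing in for the real hypercall */
static unsigned long stub_hypercall_ioremap(unsigned long ioaddr, unsigned long size)
{
        (void)size;
        return ioaddr ? ioaddr : (unsigned long)-EINVAL;   /* pretend 0 is rejected */
}

static unsigned long sketch_ioremap(unsigned long ioaddr, unsigned long size)
{
        unsigned long ret = stub_hypercall_ioremap(ioaddr, size);
        if (ret == (unsigned long)-ENOSYS)
                return 0;                  /* the real wrapper panics here */
        if (IS_ERR_VALUE(ret))
                return ioaddr;             /* any other error: keep the untranslated address */
        return ret;
}

int main(void)
{
        printf("ok:    0x%lx\n", sketch_ioremap(0x80000000UL, 0x1000));
        printf("error: 0x%lx\n", sketch_ioremap(0, 0x1000));
        return 0;
}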
84344 diff -ruNp linux-2.6.19/include/asm-ia64/hypervisor.h linux-2.6.19-xen-3.0.4/include/asm-ia64/hypervisor.h
84345 --- linux-2.6.19/include/asm-ia64/hypervisor.h  1970-01-01 00:00:00.000000000 +0000
84346 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/hypervisor.h        2007-02-02 19:10:55.000000000 +0000
84347 @@ -0,0 +1,224 @@
84348 +/******************************************************************************
84349 + * hypervisor.h
84350 + * 
84351 + * Linux-specific hypervisor handling.
84352 + * 
84353 + * Copyright (c) 2002-2004, K A Fraser
84354 + * 
84355 + * This program is free software; you can redistribute it and/or
84356 + * modify it under the terms of the GNU General Public License version 2
84357 + * as published by the Free Software Foundation; or, when distributed
84358 + * separately from the Linux kernel or incorporated into other
84359 + * software packages, subject to the following license:
84360 + * 
84361 + * Permission is hereby granted, free of charge, to any person obtaining a copy
84362 + * of this source file (the "Software"), to deal in the Software without
84363 + * restriction, including without limitation the rights to use, copy, modify,
84364 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
84365 + * and to permit persons to whom the Software is furnished to do so, subject to
84366 + * the following conditions:
84367 + * 
84368 + * The above copyright notice and this permission notice shall be included in
84369 + * all copies or substantial portions of the Software.
84370 + * 
84371 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
84372 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
84373 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
84374 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
84375 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
84376 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
84377 + * IN THE SOFTWARE.
84378 + */
84379 +
84380 +#ifndef __HYPERVISOR_H__
84381 +#define __HYPERVISOR_H__
84382 +
84383 +#ifdef CONFIG_XEN
84384 +extern int running_on_xen;
84385 +#define is_running_on_xen()                    (running_on_xen)
84386 +#else /* CONFIG_XEN */
84387 +# ifdef CONFIG_VMX_GUEST
84388 +#  define is_running_on_xen()                  (1)
84389 +# else /* CONFIG_VMX_GUEST */
84390 +#  define is_running_on_xen()                  (0)
84391 +#  define HYPERVISOR_ioremap(offset, size)     (offset)
84392 +# endif /* CONFIG_VMX_GUEST */
84393 +#endif /* CONFIG_XEN */
84394 +
84395 +#if defined(CONFIG_XEN) || defined(CONFIG_VMX_GUEST)
84396 +#include <linux/types.h>
84397 +#include <linux/kernel.h>
84398 +#include <linux/version.h>
84399 +#include <linux/errno.h>
84400 +#include <xen/interface/xen.h>
84401 +#include <xen/interface/dom0_ops.h>
84402 +#include <xen/interface/event_channel.h>
84403 +#include <xen/interface/physdev.h>
84404 +#include <xen/interface/sched.h>
84405 +#include <asm/hypercall.h>
84406 +#include <asm/ptrace.h>
84407 +#include <asm/page.h>
84408 +
84409 +extern shared_info_t *HYPERVISOR_shared_info;
84410 +extern start_info_t *xen_start_info;
84411 +
84412 +void force_evtchn_callback(void);
84413 +
84414 +#ifndef CONFIG_VMX_GUEST
84415 +/* Turn jiffies into Xen system time. XXX Implement me. */
84416 +#define jiffies_to_st(j)       0
84417 +
84418 +static inline int
84419 +HYPERVISOR_yield(
84420 +       void)
84421 +{
84422 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
84423 +
84424 +       return rc;
84425 +}
84426 +
84427 +static inline int
84428 +HYPERVISOR_block(
84429 +       void)
84430 +{
84431 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
84432 +
84433 +       return rc;
84434 +}
84435 +
84436 +static inline int
84437 +HYPERVISOR_shutdown(
84438 +       unsigned int reason)
84439 +{
84440 +       struct sched_shutdown sched_shutdown = {
84441 +               .reason = reason
84442 +       };
84443 +
84444 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
84445 +
84446 +       return rc;
84447 +}
84448 +
84449 +static inline int
84450 +HYPERVISOR_poll(
84451 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
84452 +{
84453 +       struct sched_poll sched_poll = {
84454 +               .nr_ports = nr_ports,
84455 +               .timeout = jiffies_to_st(timeout)
84456 +       };
84457 +
84458 +       int rc;
84459 +
84460 +       set_xen_guest_handle(sched_poll.ports, ports);
84461 +       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
84462 +
84463 +       return rc;
84464 +}
84465 +
84466 +#include <asm/hypercall.h>
84467 +
84468 +// for drivers/xen/privcmd/privcmd.c
84469 +#define machine_to_phys_mapping 0
84470 +struct vm_area_struct;
84471 +int direct_remap_pfn_range(struct vm_area_struct *vma,
84472 +                          unsigned long address,
84473 +                          unsigned long mfn,
84474 +                          unsigned long size,
84475 +                          pgprot_t prot,
84476 +                          domid_t  domid);
84477 +struct file;
84478 +int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
84479 +int privcmd_mmap(struct file * file, struct vm_area_struct * vma);
84480 +#define HAVE_ARCH_PRIVCMD_MMAP
84481 +
84482 +// for drivers/xen/balloon/balloon.c
84483 +#ifdef CONFIG_XEN_SCRUB_PAGES
84484 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
84485 +#else
84486 +#define scrub_pages(_p,_n) ((void)0)
84487 +#endif
84488 +#define        pte_mfn(_x)     pte_pfn(_x)
84489 +#define phys_to_machine_mapping_valid(_x)      (1)
84490 +
84491 +#endif /* !CONFIG_VMX_GUEST */
84492 +
84493 +#define __pte_ma(_x)   ((pte_t) {(_x)})        /* unmodified use */
84494 +#define pfn_pte_ma(_x,_y)      __pte_ma(0)     /* unmodified use */
84495 +
84496 +#ifndef CONFIG_VMX_GUEST
84497 +int __xen_create_contiguous_region(unsigned long vstart, unsigned int order, unsigned int address_bits);
84498 +static inline int
84499 +xen_create_contiguous_region(unsigned long vstart,
84500 +                             unsigned int order, unsigned int address_bits)
84501 +{
84502 +       int ret = 0;
84503 +       if (is_running_on_xen()) {
84504 +               ret = __xen_create_contiguous_region(vstart, order,
84505 +                                                    address_bits);
84506 +       }
84507 +       return ret;
84508 +}
84509 +
84510 +void __xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
84511 +static inline void
84512 +xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
84513 +{
84514 +       if (is_running_on_xen())
84515 +               __xen_destroy_contiguous_region(vstart, order);
84516 +}
84517 +
84518 +#endif /* !CONFIG_VMX_GUEST */
84519 +
84520 +// for netfront.c, netback.c
84521 +#define MULTI_UVMFLAGS_INDEX 0 //XXX any value
84522 +
84523 +static inline void
84524 +MULTI_update_va_mapping(
84525 +       multicall_entry_t *mcl, unsigned long va,
84526 +       pte_t new_val, unsigned long flags)
84527 +{
84528 +       mcl->op = __HYPERVISOR_update_va_mapping;
84529 +       mcl->result = 0;
84530 +}
84531 +
84532 +static inline void
84533 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
84534 +       void *uop, unsigned int count)
84535 +{
84536 +       mcl->op = __HYPERVISOR_grant_table_op;
84537 +       mcl->args[0] = cmd;
84538 +       mcl->args[1] = (unsigned long)uop;
84539 +       mcl->args[2] = count;
84540 +}
84541 +
84542 +/*
84543 + * for blktap.c
84544 + * int create_lookup_pte_addr(struct mm_struct *mm, 
84545 + *                            unsigned long address,
84546 + *                            uint64_t *ptep);
84547 + */
84548 +#define create_lookup_pte_addr(mm, address, ptep)                      \
84549 +       ({                                                              \
84550 +               printk(KERN_EMERG                                       \
84551 +                      "%s:%d "                                         \
84552 +                      "create_lookup_pte_addr() isn't supported.\n",   \
84553 +                      __func__, __LINE__);                             \
84554 +               BUG();                                                  \
84555 +               (-ENOSYS);                                              \
84556 +       })
84557 +
84558 +// for debug
84559 +asmlinkage int xprintk(const char *fmt, ...);
84560 +#define xprintd(fmt, ...)      xprintk("%s:%d " fmt, __func__, __LINE__, \
84561 +                                       ##__VA_ARGS__)
84562 +
84563 +#endif /* CONFIG_XEN || CONFIG_VMX_GUEST */
84564 +
84565 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
84566 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
84567 +#else
84568 +#define is_initial_xendomain() 0
84569 +#endif
84570 +
84571 +#endif /* __HYPERVISOR_H__ */
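The scrub_pages() macro above zeroes whole pages before the balloon driver hands them back to the hypervisor. A standalone sketch of the size arithmetic, assuming 16KB pages (PAGE_SHIFT 14) purely for the demo; the kernel value is configuration-dependent:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SHIFT 14                               /* assumption for the demo */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define scrub_pages(_p, _n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)

int main(void)
{
        unsigned char *buf = malloc(2 * PAGE_SIZE);
        memset(buf, 0xaa, 2 * PAGE_SIZE);
        scrub_pages(buf, 2);                        /* zero both pages */
        assert(buf[0] == 0 && buf[2 * PAGE_SIZE - 1] == 0);
        free(buf);
        return 0;
}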
84572 diff -ruNp linux-2.6.19/include/asm-ia64/intel_intrin.h linux-2.6.19-xen-3.0.4/include/asm-ia64/intel_intrin.h
84573 --- linux-2.6.19/include/asm-ia64/intel_intrin.h        2006-11-29 21:57:37.000000000 +0000
84574 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/intel_intrin.h      2007-02-02 19:10:55.000000000 +0000
84575 @@ -16,8 +16,10 @@
84576                          * intrinsic
84577                          */
84578  
84579 -#define ia64_getreg            __getReg
84580 -#define ia64_setreg            __setReg
84581 +#define __ia64_getreg          __getReg
84582 +#define __ia64_setreg          __setReg
84583 +
84584 +#define __ia64_hint(x)
84585  
84586  #define ia64_hint              __hint
84587  #define ia64_hint_pause                __hint_pause
84588 @@ -33,16 +35,16 @@
84589  #define ia64_getf_exp          __getf_exp
84590  #define ia64_shrp              _m64_shrp
84591  
84592 -#define ia64_tpa               __tpa
84593 +#define __ia64_tpa             __tpa
84594  #define ia64_invala            __invala
84595  #define ia64_invala_gr         __invala_gr
84596  #define ia64_invala_fr         __invala_fr
84597  #define ia64_nop               __nop
84598  #define ia64_sum               __sum
84599 -#define ia64_ssm               __ssm
84600 +#define __ia64_ssm             __ssm
84601  #define ia64_rum               __rum
84602 -#define ia64_rsm               __rsm
84603 -#define ia64_fc                __fc
84604 +#define __ia64_rsm             __rsm
84605 +#define __ia64_fc              __fc
84606  
84607  #define ia64_ldfs              __ldfs
84608  #define ia64_ldfd              __ldfd
84609 @@ -80,24 +82,24 @@
84610  
84611  #define __ia64_set_dbr(index, val)     \
84612                 __setIndReg(_IA64_REG_INDR_DBR, index, val)
84613 -#define ia64_set_ibr(index, val)       \
84614 +#define __ia64_set_ibr(index, val)     \
84615                 __setIndReg(_IA64_REG_INDR_IBR, index, val)
84616 -#define ia64_set_pkr(index, val)       \
84617 +#define __ia64_set_pkr(index, val)     \
84618                 __setIndReg(_IA64_REG_INDR_PKR, index, val)
84619 -#define ia64_set_pmc(index, val)       \
84620 +#define __ia64_set_pmc(index, val)     \
84621                 __setIndReg(_IA64_REG_INDR_PMC, index, val)
84622 -#define ia64_set_pmd(index, val)       \
84623 +#define __ia64_set_pmd(index, val)     \
84624                 __setIndReg(_IA64_REG_INDR_PMD, index, val)
84625 -#define ia64_set_rr(index, val)        \
84626 +#define __ia64_set_rr(index, val)      \
84627                 __setIndReg(_IA64_REG_INDR_RR, index, val)
84628  
84629 -#define ia64_get_cpuid(index)  __getIndReg(_IA64_REG_INDR_CPUID, index)
84630 +#define __ia64_get_cpuid(index)        __getIndReg(_IA64_REG_INDR_CPUID, index)
84631  #define __ia64_get_dbr(index)  __getIndReg(_IA64_REG_INDR_DBR, index)
84632 -#define ia64_get_ibr(index)    __getIndReg(_IA64_REG_INDR_IBR, index)
84633 -#define ia64_get_pkr(index)    __getIndReg(_IA64_REG_INDR_PKR, index)
84634 -#define ia64_get_pmc(index)    __getIndReg(_IA64_REG_INDR_PMC, index)
84635 -#define ia64_get_pmd(index)    __getIndReg(_IA64_REG_INDR_PMD, index)
84636 -#define ia64_get_rr(index)     __getIndReg(_IA64_REG_INDR_RR, index)
84637 +#define __ia64_get_ibr(index)  __getIndReg(_IA64_REG_INDR_IBR, index)
84638 +#define __ia64_get_pkr(index)  __getIndReg(_IA64_REG_INDR_PKR, index)
84639 +#define __ia64_get_pmc(index)  __getIndReg(_IA64_REG_INDR_PMC, index)
84640 +#define __ia64_get_pmd(index)          __getIndReg(_IA64_REG_INDR_PMD, index)
84641 +#define __ia64_get_rr(index)   __getIndReg(_IA64_REG_INDR_RR, index)
84642  
84643  #define ia64_srlz_d            __dsrlz
84644  #define ia64_srlz_i            __isrlz
84645 @@ -116,18 +118,18 @@
84646  #define ia64_ld8_acq           __ld8_acq
84647  
84648  #define ia64_sync_i            __synci
84649 -#define ia64_thash             __thash
84650 -#define ia64_ttag              __ttag
84651 -#define ia64_itcd              __itcd
84652 -#define ia64_itci              __itci
84653 -#define ia64_itrd              __itrd
84654 -#define ia64_itri              __itri
84655 -#define ia64_ptce              __ptce
84656 -#define ia64_ptcl              __ptcl
84657 -#define ia64_ptcg              __ptcg
84658 -#define ia64_ptcga             __ptcga
84659 -#define ia64_ptri              __ptri
84660 -#define ia64_ptrd              __ptrd
84661 +#define __ia64_thash           __thash
84662 +#define __ia64_ttag            __ttag
84663 +#define __ia64_itcd            __itcd
84664 +#define __ia64_itci            __itci
84665 +#define __ia64_itrd            __itrd
84666 +#define __ia64_itri            __itri
84667 +#define __ia64_ptce            __ptce
84668 +#define __ia64_ptcl            __ptcl
84669 +#define __ia64_ptcg            __ptcg
84670 +#define __ia64_ptcga           __ptcga
84671 +#define __ia64_ptri            __ptri
84672 +#define __ia64_ptrd            __ptrd
84673  #define ia64_dep_mi            _m64_dep_mi
84674  
84675  /* Values for lfhint in __lfetch and __lfetch_fault */
84676 @@ -142,16 +144,18 @@
84677  #define ia64_lfetch_fault      __lfetch_fault
84678  #define ia64_lfetch_fault_excl __lfetch_fault_excl
84679  
84680 -#define ia64_intrin_local_irq_restore(x)               \
84681 +#define __ia64_intrin_local_irq_restore(x)             \
84682  do {                                                   \
84683         if ((x) != 0) {                                 \
84684 -               ia64_ssm(IA64_PSR_I);                   \
84685 +               __ia64_ssm(IA64_PSR_I);                 \
84686                 ia64_srlz_d();                          \
84687         } else {                                        \
84688 -               ia64_rsm(IA64_PSR_I);                   \
84689 +               __ia64_rsm(IA64_PSR_I);                 \
84690         }                                               \
84691  } while (0)
84692  
84693 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
84694 +
84695  #define __builtin_trap()       __break(0);
84696  
84697  #endif /* _ASM_IA64_INTEL_INTRIN_H */
84698 diff -ruNp linux-2.6.19/include/asm-ia64/io.h linux-2.6.19-xen-3.0.4/include/asm-ia64/io.h
84699 --- linux-2.6.19/include/asm-ia64/io.h  2006-11-29 21:57:37.000000000 +0000
84700 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/io.h        2007-02-02 19:10:55.000000000 +0000
84701 @@ -66,9 +66,11 @@ extern unsigned int num_io_spaces;
84702  #define PIO_RESERVED           __IA64_UNCACHED_OFFSET
84703  #define HAVE_ARCH_PIO_SIZE
84704  
84705 +#include <asm/hypervisor.h>
84706  #include <asm/intrinsics.h>
84707  #include <asm/machvec.h>
84708  #include <asm/page.h>
84709 +#include <asm/privop.h>
84710  #include <asm/system.h>
84711  #include <asm-generic/iomap.h>
84712  
84713 @@ -96,9 +98,46 @@ extern int valid_mmap_phys_addr_range (u
84714   * The following two macros are deprecated and scheduled for removal.
84715   * Please use the PCI-DMA interface defined in <asm/pci.h> instead.
84716   */
84717 +#ifndef CONFIG_XEN
84718  #define bus_to_virt    phys_to_virt
84719  #define virt_to_bus    virt_to_phys
84720  #define page_to_bus    page_to_phys
84721 +#define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
84722 +#define page_to_pseudophys(page)       page_to_phys(page)
84723 +#else /* CONFIG_XEN */
84724 +#define bus_to_virt(bus)       \
84725 +       phys_to_virt(machine_to_phys_for_dma(bus))
84726 +#define virt_to_bus(virt)      \
84727 +       phys_to_machine_for_dma(virt_to_phys(virt))
84728 +#define page_to_bus(page)      \
84729 +       phys_to_machine_for_dma(page_to_pseudophys(page))
84730 +
84731 +#define page_to_pseudophys(page) \
84732 +       ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
84733 +
84734 +/*
84735 + * Drivers that use page_to_phys() for bus addresses are broken.
84736 + * This includes:
84737 + * drivers/ide/cris/ide-cris.c
84738 + * drivers/scsi/dec_esp.c
84739 + */
84740 +#define page_to_phys(page)     (page_to_pseudophys(page))
84741 +#define bvec_to_bus(bv)                (page_to_bus((bv)->bv_page) + \
84742 +                               (unsigned long) (bv)->bv_offset)
84743 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) +  \
84744 +                                (unsigned long) bio_offset((bio)))
84745 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) +   \
84746 +                                (unsigned long) (bv)->bv_offset)
84747 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
84748 +       (((bvec_to_bus((vec1)) + (vec1)->bv_len) == bvec_to_bus((vec2))) && \
84749 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) ==              \
84750 +         bvec_to_pseudophys((vec2))))
84751 +
84752 +/* We will be supplying our own /dev/mem implementation */
84753 +#define ARCH_HAS_DEV_MEM
84754 +#define ARCH_HAS_DEV_MEM_MMAP_MEM
84755 +int xen_mmap_mem(struct file * file, struct vm_area_struct * vma);
84756 +#endif /* CONFIG_XEN */
84757  
84758  # endif /* KERNEL */
84759  
84760 @@ -418,9 +457,20 @@ __writeq (unsigned long val, volatile vo
84761  #endif
84762  
84763  # ifdef __KERNEL__
84764 -
84765 -extern void __iomem * ioremap(unsigned long offset, unsigned long size);
84766 -extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
84767 +/*
84768 + * An "address" in IO memory space is not clearly either an integer or a pointer. We will
84769 + * accept both, thus the casts.
84770 + *
84771 + * On ia-64, we access the physical I/O memory space through the uncached kernel region.
84772 + */
84773 +static inline void __iomem *
84774 +ioremap (unsigned long offset, unsigned long size)
84775 +{
84776 +       offset = HYPERVISOR_ioremap(offset, size);
84777 +       if (IS_ERR_VALUE(offset))
84778 +               return (void __iomem*)offset;
84779 +       return (void __iomem *) (__IA64_UNCACHED_OFFSET | (offset));
84780 +}
84781  
84782  static inline void
84783  iounmap (volatile void __iomem *addr)
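The new ioremap() above maps an I/O address by OR-ing it into the ia64 uncached identity region rather than building page tables. In the sketch below the __IA64_UNCACHED_OFFSET value (region 6 base) is an assumption for the demo, not something taken from this patch:

#include <assert.h>
#include <stdio.h>

#define __IA64_UNCACHED_OFFSET 0xc000000000000000UL   /* assumed value */

static unsigned long sketch_uncached_vaddr(unsigned long machine_addr)
{
        return __IA64_UNCACHED_OFFSET | machine_addr;
}

int main(void)
{
        unsigned long io = 0x00000000f8000000UL;      /* example MMIO address */
        unsigned long va = sketch_uncached_vaddr(io);
        assert((va & ~__IA64_UNCACHED_OFFSET) == io); /* physical bits preserved */
        printf("ioremap(0x%lx) -> 0x%lx\n", io, va);
        return 0;
}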
84784 diff -ruNp linux-2.6.19/include/asm-ia64/iosapic.h linux-2.6.19-xen-3.0.4/include/asm-ia64/iosapic.h
84785 --- linux-2.6.19/include/asm-ia64/iosapic.h     2006-11-29 21:57:37.000000000 +0000
84786 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/iosapic.h   2007-02-02 19:10:55.000000000 +0000
84787 @@ -53,6 +53,7 @@
84788  
84789  #define NR_IOSAPICS                    256
84790  
84791 +#ifndef CONFIG_XEN
84792  static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
84793  {
84794         writel(reg, iosapic + IOSAPIC_REG_SELECT);
84795 @@ -64,6 +65,7 @@ static inline void iosapic_write(char __
84796         writel(reg, iosapic + IOSAPIC_REG_SELECT);
84797         writel(val, iosapic + IOSAPIC_WINDOW);
84798  }
84799 +#endif
84800  
84801  static inline void iosapic_eoi(char __iomem *iosapic, u32 vector)
84802  {
84803 diff -ruNp linux-2.6.19/include/asm-ia64/irq.h linux-2.6.19-xen-3.0.4/include/asm-ia64/irq.h
84804 --- linux-2.6.19/include/asm-ia64/irq.h 2006-11-29 21:57:37.000000000 +0000
84805 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/irq.h       2007-02-02 19:10:55.000000000 +0000
84806 @@ -11,8 +11,41 @@
84807   * 02/29/00     D.Mosberger    moved most things into hw_irq.h
84808   */
84809  
84810 +#ifndef CONFIG_XEN
84811  #define NR_IRQS                256
84812  #define NR_IRQ_VECTORS NR_IRQS
84813 +#else
84814 +/*
84815 + * The flat IRQ space is divided into two regions:
84816 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
84817 + *     if we have physical device-access privilege. This region is at the 
84818 + *     start of the IRQ space so that existing device drivers do not need
84819 + *     to be modified to translate physical IRQ numbers into our IRQ space.
84820 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
84821 + *     are bound using the provided bind/unbind functions.
84822 + */
84823 +
84824 +#define PIRQ_BASE              0
84825 +#define NR_PIRQS               256
84826 +
84827 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
84828 +#define NR_DYNIRQS             256
84829 +
84830 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
84831 +#define NR_IRQ_VECTORS         NR_IRQS
84832 +
84833 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
84834 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
84835 +
84836 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
84837 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
84838 +
84839 +#define RESCHEDULE_VECTOR      0
84840 +#define IPI_VECTOR             1
84841 +#define CMCP_VECTOR            2
84842 +#define CPEP_VECTOR            3
84843 +#define NR_IPIS                        4
84844 +#endif /* CONFIG_XEN */
84845  
84846  static __inline__ int
84847  irq_canonicalize (int irq)
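The constants below are copied from the Xen branch of the hunk above; the sketch only checks the arithmetic of the flat IRQ space (physical IRQs first, dynamic event-channel IRQs after them):

#include <assert.h>
#include <stdio.h>

#define PIRQ_BASE        0
#define NR_PIRQS         256
#define DYNIRQ_BASE      (PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS       256
#define NR_IRQS          (NR_PIRQS + NR_DYNIRQS)

#define pirq_to_irq(_x)    ((_x) + PIRQ_BASE)
#define irq_to_pirq(_x)    ((_x) - PIRQ_BASE)
#define dynirq_to_irq(_x)  ((_x) + DYNIRQ_BASE)
#define irq_to_dynirq(_x)  ((_x) - DYNIRQ_BASE)

int main(void)
{
        assert(NR_IRQS == 512);
        assert(pirq_to_irq(10) == 10);                /* physical IRQs map 1:1 */
        assert(dynirq_to_irq(0) == 256);              /* dynamic IRQs start after them */
        assert(irq_to_dynirq(dynirq_to_irq(42)) == 42);
        printf("flat IRQ space: %d entries\n", NR_IRQS);
        return 0;
}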
84848 diff -ruNp linux-2.6.19/include/asm-ia64/machvec_dig.h linux-2.6.19-xen-3.0.4/include/asm-ia64/machvec_dig.h
84849 --- linux-2.6.19/include/asm-ia64/machvec_dig.h 2006-11-29 21:57:37.000000000 +0000
84850 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/machvec_dig.h       2007-02-02 19:10:55.000000000 +0000
84851 @@ -13,4 +13,19 @@ extern ia64_mv_setup_t dig_setup;
84852  #define platform_name          "dig"
84853  #define platform_setup         dig_setup
84854  
84855 +#ifdef CONFIG_XEN
84856 +# define platform_dma_map_sg           dma_map_sg
84857 +# define platform_dma_unmap_sg         dma_unmap_sg
84858 +# define platform_dma_mapping_error    dma_mapping_error
84859 +# define platform_dma_supported                dma_supported
84860 +# define platform_dma_alloc_coherent   dma_alloc_coherent
84861 +# define platform_dma_free_coherent    dma_free_coherent
84862 +# define platform_dma_map_single       dma_map_single
84863 +# define platform_dma_unmap_single     dma_unmap_single
84864 +# define platform_dma_sync_single_for_cpu \
84865 +                                       dma_sync_single_for_cpu
84866 +# define platform_dma_sync_single_for_device \
84867 +                                       dma_sync_single_for_device
84868 +#endif
84869 +
84870  #endif /* _ASM_IA64_MACHVEC_DIG_h */
84871 diff -ruNp linux-2.6.19/include/asm-ia64/maddr.h linux-2.6.19-xen-3.0.4/include/asm-ia64/maddr.h
84872 --- linux-2.6.19/include/asm-ia64/maddr.h       1970-01-01 00:00:00.000000000 +0000
84873 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/maddr.h     2007-02-02 19:10:55.000000000 +0000
84874 @@ -0,0 +1,102 @@
84875 +#ifndef _ASM_IA64_MADDR_H
84876 +#define _ASM_IA64_MADDR_H
84877 +
84878 +#include <linux/kernel.h>
84879 +#include <asm/hypervisor.h>
84880 +#include <xen/features.h>
84881 +#include <xen/interface/xen.h>
84882 +
84883 +#ifdef CONFIG_XEN
84884 +
84885 +#define INVALID_P2M_ENTRY       (~0UL)
84886 +
84887 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
84888 +extern int p2m_initialized;
84889 +extern unsigned long p2m_min_low_pfn;
84890 +extern unsigned long p2m_max_low_pfn;
84891 +extern unsigned long p2m_convert_min_pfn;
84892 +extern unsigned long p2m_convert_max_pfn;
84893 +extern volatile const pte_t* p2m_pte;
84894 +unsigned long p2m_phystomach(unsigned long gpfn);
84895 +#else
84896 +#define p2m_initialized                (0)
84897 +#define p2m_phystomach(gpfn)   INVALID_MFN
84898 +#endif
84899 +
84900 +/* XXX xen page size != page size */
84901 +static inline unsigned long
84902 +pfn_to_mfn_for_dma(unsigned long pfn)
84903 +{
84904 +       unsigned long mfn;
84905 +       if (p2m_initialized)
84906 +               return p2m_phystomach(pfn);
84907 +       mfn = HYPERVISOR_phystomach(pfn);
84908 +       BUG_ON(mfn == 0); // XXX
84909 +       BUG_ON(mfn == INVALID_P2M_ENTRY); // XXX
84910 +       BUG_ON(mfn == INVALID_MFN);
84911 +       return mfn;
84912 +}
84913 +
84914 +static inline unsigned long
84915 +phys_to_machine_for_dma(unsigned long phys)
84916 +{
84917 +       unsigned long machine =
84918 +                     pfn_to_mfn_for_dma(phys >> PAGE_SHIFT) << PAGE_SHIFT;
84919 +       machine |= (phys & ~PAGE_MASK);
84920 +       return machine;
84921 +}
84922 +
84923 +static inline unsigned long
84924 +mfn_to_pfn_for_dma(unsigned long mfn)
84925 +{
84926 +       unsigned long pfn;
84927 +       pfn = HYPERVISOR_machtophys(mfn);
84928 +       BUG_ON(pfn == 0);
84929 +       //BUG_ON(pfn == INVALID_M2P_ENTRY);
84930 +       return pfn;
84931 +}
84932 +
84933 +static inline unsigned long
84934 +machine_to_phys_for_dma(unsigned long machine)
84935 +{
84936 +       unsigned long phys =
84937 +                     mfn_to_pfn_for_dma(machine >> PAGE_SHIFT) << PAGE_SHIFT;
84938 +       phys |= (machine & ~PAGE_MASK);
84939 +       return phys;
84940 +}
84941 +
84942 +static inline unsigned long
84943 +mfn_to_local_pfn(unsigned long mfn)
84944 +{
84945 +       extern unsigned long max_mapnr;
84946 +       unsigned long pfn = mfn_to_pfn_for_dma(mfn);
84947 +       if (!pfn_valid(pfn))
84948 +               return INVALID_P2M_ENTRY;
84949 +       return pfn;
84950 +}
84951 +
84952 +#else /* !CONFIG_XEN */
84953 +
84954 +#define pfn_to_mfn_for_dma(pfn) (pfn)
84955 +#define mfn_to_pfn_for_dma(mfn) (mfn)
84956 +#define phys_to_machine_for_dma(phys) (phys)
84957 +#define machine_to_phys_for_dma(machine) (machine)
84958 +#define mfn_to_local_pfn(mfn) (mfn)
84959 +
84960 +#endif /* !CONFIG_XEN */
84961 +
84962 +/* XXX to compile set_phys_to_machine(vaddr, FOREIGN_FRAME(m)) */
84963 +#define FOREIGN_FRAME(m)        (INVALID_P2M_ENTRY)
84964 +
84965 +#define mfn_to_pfn(mfn) (mfn)
84966 +#define pfn_to_mfn(pfn) (pfn)
84967 +
84968 +#define mfn_to_virt(mfn) (__va((mfn) << PAGE_SHIFT))
84969 +#define virt_to_mfn(virt) (__pa(virt) >> PAGE_SHIFT)
84970 +#define virt_to_machine(virt) __pa(virt) // for tpmfront.c
84971 +
84972 +#define set_phys_to_machine(pfn, mfn) do { } while (0)
84973 +
84974 +typedef unsigned long maddr_t; // to compile netback, netfront
84975 +
84976 +#endif /* _ASM_IA64_MADDR_H */
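phys_to_machine_for_dma()/machine_to_phys_for_dma() above translate only the frame number; the byte offset within the page is OR-ed back in unchanged. A standalone sketch with a toy translation in place of the HYPERVISOR_phystomach()/machtophys() hypercalls and an assumed PAGE_SHIFT of 14:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 14                                /* assumption for the demo */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* toy p2m/m2p standing in for the real hypercalls */
static unsigned long toy_p2m(unsigned long pfn) { return pfn + 0x1000; }
static unsigned long toy_m2p(unsigned long mfn) { return mfn - 0x1000; }

static unsigned long phys_to_machine(unsigned long phys)
{
        return (toy_p2m(phys >> PAGE_SHIFT) << PAGE_SHIFT) | (phys & ~PAGE_MASK);
}

static unsigned long machine_to_phys(unsigned long machine)
{
        return (toy_m2p(machine >> PAGE_SHIFT) << PAGE_SHIFT) | (machine & ~PAGE_MASK);
}

int main(void)
{
        unsigned long phys = (0x42UL << PAGE_SHIFT) | 0x123;  /* pfn 0x42, offset 0x123 */
        unsigned long mach = phys_to_machine(phys);
        assert((mach & ~PAGE_MASK) == 0x123);                 /* offset preserved */
        assert(machine_to_phys(mach) == phys);                /* round trip */
        printf("phys 0x%lx -> machine 0x%lx\n", phys, mach);
        return 0;
}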
84977 diff -ruNp linux-2.6.19/include/asm-ia64/meminit.h linux-2.6.19-xen-3.0.4/include/asm-ia64/meminit.h
84978 --- linux-2.6.19/include/asm-ia64/meminit.h     2006-11-29 21:57:37.000000000 +0000
84979 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/meminit.h   2007-02-02 19:10:55.000000000 +0000
84980 @@ -16,10 +16,15 @@
84981   *     - command line string
84982   *     - kernel code & data
84983   *     - Kernel memory map built from EFI memory map
84984 + *     - xen start info
84985   *
84986   * More could be added if necessary
84987   */
84988 +#ifndef CONFIG_XEN
84989  #define IA64_MAX_RSVD_REGIONS 6
84990 +#else
84991 +#define IA64_MAX_RSVD_REGIONS 7
84992 +#endif
84993  
84994  struct rsvd_region {
84995         unsigned long start;    /* virtual address of beginning of element */
84996 diff -ruNp linux-2.6.19/include/asm-ia64/page.h linux-2.6.19-xen-3.0.4/include/asm-ia64/page.h
84997 --- linux-2.6.19/include/asm-ia64/page.h        2006-11-29 21:57:37.000000000 +0000
84998 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/page.h      2007-02-02 19:10:55.000000000 +0000
84999 @@ -126,7 +126,9 @@ extern unsigned long max_low_pfn;
85000  # define pfn_valid(pfn)                (((pfn) >= min_low_pfn) && ((pfn) < max_low_pfn) && ia64_pfn_valid(pfn))
85001  #endif
85002  
85003 +#ifndef CONFIG_XEN
85004  #define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
85005 +#endif
85006  #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
85007  #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
85008  
85009 @@ -227,5 +229,53 @@ get_order (unsigned long size)
85010                                          (((current->personality & READ_IMPLIES_EXEC) != 0)     \
85011                                           ? VM_EXEC : 0))
85012  
85013 -# endif /* __KERNEL__ */
85014 +#ifndef __ASSEMBLY__
85015 +#ifdef CONFIG_XEN
85016 +
85017 +#include <linux/kernel.h>
85018 +#include <asm/hypervisor.h>
85019 +#include <xen/features.h>      // to compile netback, netfront
85020 +
85021 +/*
85022 + * XXX hack!
85023 + * Linux/IA64 uses PG_arch_1.
85024 + * This hack will be removed once PG_foreign bit is taken.
85025 + * #include <xen/foreign_page.h>
85026 + */
85027 +#ifdef __ASM_XEN_FOREIGN_PAGE_H__
85028 +# error "don't include include/xen/foreign_page.h!"
85029 +#endif
85030 +
85031 +extern struct address_space xen_ia64_foreign_dummy_mapping;
85032 +#define PageForeign(page)      \
85033 +       ((page)->mapping == &xen_ia64_foreign_dummy_mapping)
85034 +
85035 +#define SetPageForeign(page, dtor) do {                                \
85036 +       set_page_private((page), (unsigned long)(dtor));        \
85037 +       (page)->mapping = &xen_ia64_foreign_dummy_mapping;      \
85038 +       smp_rmb();                                              \
85039 +} while (0)
85040 +
85041 +#define ClearPageForeign(page) do {    \
85042 +       (page)->mapping = NULL;         \
85043 +       smp_rmb();                      \
85044 +       set_page_private((page), 0);    \
85045 +} while (0)
85046 +
85047 +#define PageForeignDestructor(page)    \
85048 +       ( (void (*) (struct page *)) page_private(page) )
85049 +
85050 +#define arch_free_page(_page,_order)                   \
85051 +({      int foreign = PageForeign(_page);               \
85052 +       if (foreign)                                    \
85053 +               (PageForeignDestructor(_page))(_page);  \
85054 +       foreign;                                        \
85055 +})
85056 +#define HAVE_ARCH_FREE_PAGE
85057 +
85058 +#include <asm/maddr.h>
85059 +
85060 +#endif /* CONFIG_XEN */
85061 +#endif /* __ASSEMBLY__ */
85062 +#endif /* __KERNEL__ */
85063  #endif /* _ASM_IA64_PAGE_H */
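The PageForeign machinery added above tags a granted page by pointing page->mapping at a dummy address_space and stashing a destructor in page_private; arch_free_page() then diverts the free into that destructor instead of handing the page back to the allocator (presumably also why asm-um's arch_free_page() grows an int return later in this patch). A compilable user-space model of that convention, with hypothetical names standing in for the kernel types:

#include <stdio.h>

struct page { void *mapping; unsigned long private_data; };
static char dummy_mapping;                 /* stands in for xen_ia64_foreign_dummy_mapping */

#define PageForeign(p)   ((p)->mapping == (void *)&dummy_mapping)
#define SetPageForeign(p, dtor) do {                       \
        (p)->private_data = (unsigned long)(dtor);         \
        (p)->mapping = &dummy_mapping;                     \
} while (0)
#define PageForeignDestructor(p) ((void (*)(struct page *))(p)->private_data)

static int arch_free_page(struct page *p)
{
        if (!PageForeign(p))
                return 0;                   /* let the normal allocator free it */
        PageForeignDestructor(p)(p);        /* e.g. give the grant back to the backend */
        return 1;
}

static void grant_destructor(struct page *p) { printf("released granted page %p\n", (void *)p); }

int main(void)
{
        struct page pg = { 0, 0 };
        SetPageForeign(&pg, grant_destructor);
        printf("diverted: %d\n", arch_free_page(&pg));
        return 0;
}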
85064 diff -ruNp linux-2.6.19/include/asm-ia64/pal.h linux-2.6.19-xen-3.0.4/include/asm-ia64/pal.h
85065 --- linux-2.6.19/include/asm-ia64/pal.h 2006-11-29 21:57:37.000000000 +0000
85066 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/pal.h       2007-02-02 19:10:55.000000000 +0000
85067 @@ -83,6 +83,7 @@
85068  #ifndef __ASSEMBLY__
85069  
85070  #include <linux/types.h>
85071 +#include <asm/processor.h>
85072  #include <asm/fpu.h>
85073  
85074  /*
85075 diff -ruNp linux-2.6.19/include/asm-ia64/pgalloc.h linux-2.6.19-xen-3.0.4/include/asm-ia64/pgalloc.h
85076 --- linux-2.6.19/include/asm-ia64/pgalloc.h     2006-11-29 21:57:37.000000000 +0000
85077 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/pgalloc.h   2007-02-02 19:10:55.000000000 +0000
85078 @@ -125,7 +125,11 @@ static inline void pmd_free(pmd_t * pmd)
85079  static inline void
85080  pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
85081  {
85082 +#ifndef CONFIG_XEN
85083         pmd_val(*pmd_entry) = page_to_phys(pte);
85084 +#else
85085 +       pmd_val(*pmd_entry) = page_to_pseudophys(pte);
85086 +#endif
85087  }
85088  
85089  static inline void
85090 diff -ruNp linux-2.6.19/include/asm-ia64/privop.h linux-2.6.19-xen-3.0.4/include/asm-ia64/privop.h
85091 --- linux-2.6.19/include/asm-ia64/privop.h      1970-01-01 00:00:00.000000000 +0000
85092 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/privop.h    2007-02-02 19:10:55.000000000 +0000
85093 @@ -0,0 +1,60 @@
85094 +#ifndef _ASM_IA64_PRIVOP_H
85095 +#define _ASM_IA64_PRIVOP_H
85096 +
85097 +/*
85098 + * Copyright (C) 2005 Hewlett-Packard Co
85099 + *     Dan Magenheimer <dan.magenheimer@hp.com>
85100 + *
85101 + */
85102 +
85103 +#ifdef CONFIG_XEN
85104 +#include <asm/xen/privop.h>
85105 +#endif
85106 +
85107 +#ifndef __ASSEMBLY__
85108 +
85109 +#ifndef IA64_PARAVIRTUALIZED
85110 +
85111 +#define ia64_getreg                    __ia64_getreg
85112 +#define ia64_setreg                    __ia64_setreg
85113 +#define ia64_hint                      __ia64_hint
85114 +#define ia64_thash                     __ia64_thash
85115 +#define ia64_itci                      __ia64_itci
85116 +#define ia64_itcd                      __ia64_itcd
85117 +#define ia64_itri                      __ia64_itri
85118 +#define ia64_itrd                      __ia64_itrd
85119 +#define ia64_tpa                       __ia64_tpa
85120 +#define ia64_set_ibr                   __ia64_set_ibr
85121 +#define ia64_set_pkr                   __ia64_set_pkr
85122 +#define ia64_set_pmc                   __ia64_set_pmc
85123 +#define ia64_set_pmd                   __ia64_set_pmd
85124 +#define ia64_set_rr                    __ia64_set_rr
85125 +#define ia64_get_cpuid                 __ia64_get_cpuid
85126 +#define ia64_get_ibr                   __ia64_get_ibr
85127 +#define ia64_get_pkr                   __ia64_get_pkr
85128 +#define ia64_get_pmc                   __ia64_get_pmc
85129 +#define ia64_get_pmd                   __ia64_get_pmd
85130 +#define ia64_get_rr                    __ia64_get_rr
85131 +#define ia64_fc                                __ia64_fc
85132 +#define ia64_ssm                       __ia64_ssm
85133 +#define ia64_rsm                       __ia64_rsm
85134 +#define ia64_ptce                      __ia64_ptce
85135 +#define ia64_ptcga                     __ia64_ptcga
85136 +#define ia64_ptcl                      __ia64_ptcl
85137 +#define ia64_ptri                      __ia64_ptri
85138 +#define ia64_ptrd                      __ia64_ptrd
85139 +#define ia64_get_psr_i                 __ia64_get_psr_i
85140 +#define ia64_intrin_local_irq_restore  __ia64_intrin_local_irq_restore
85141 +#define ia64_pal_halt_light            __ia64_pal_halt_light
85142 +#define ia64_leave_kernel              __ia64_leave_kernel
85143 +#define ia64_leave_syscall             __ia64_leave_syscall
85144 +#define ia64_trace_syscall             __ia64_trace_syscall
85145 +#define ia64_ret_from_clone            __ia64_ret_from_clone
85146 +#define ia64_switch_to                 __ia64_switch_to
85147 +#define ia64_pal_call_static           __ia64_pal_call_static
85148 +
85149 +#endif /* !IA64_PARAVIRTUALIZED */
85150 +
85151 +#endif /* !__ASSEMBLY__ */
85152 +
85153 +#endif /* _ASM_IA64_PRIVOP_H */
85154 diff -ruNp linux-2.6.19/include/asm-ia64/processor.h linux-2.6.19-xen-3.0.4/include/asm-ia64/processor.h
85155 --- linux-2.6.19/include/asm-ia64/processor.h   2006-11-29 21:57:37.000000000 +0000
85156 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/processor.h 2007-02-02 19:10:55.000000000 +0000
85157 @@ -18,6 +18,7 @@
85158  #include <asm/kregs.h>
85159  #include <asm/ptrace.h>
85160  #include <asm/ustack.h>
85161 +#include <asm/privop.h>
85162  
85163  #define IA64_NUM_DBG_REGS      8
85164  
85165 diff -ruNp linux-2.6.19/include/asm-ia64/sal.h linux-2.6.19-xen-3.0.4/include/asm-ia64/sal.h
85166 --- linux-2.6.19/include/asm-ia64/sal.h 2006-11-29 21:57:37.000000000 +0000
85167 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/sal.h       2007-02-02 19:10:55.000000000 +0000
85168 @@ -42,6 +42,9 @@
85169  #include <asm/pal.h>
85170  #include <asm/system.h>
85171  #include <asm/fpu.h>
85172 +#ifdef CONFIG_XEN
85173 +#include <asm/xen/xencomm.h>
85174 +#endif
85175  
85176  extern spinlock_t sal_lock;
85177  
85178 @@ -687,10 +690,28 @@ ia64_sal_clear_state_info (u64 sal_info_
85179  /* Get the processor and platform information logged by SAL with respect to the machine
85180   * state at the time of the MCAs, INITs, CMCs, or CPEs.
85181   */
85182 +#ifdef CONFIG_XEN
85183 +static inline u64 ia64_sal_get_state_info_size (u64 sal_info_type);
85184 +#endif
85185 +
85186  static inline u64
85187  ia64_sal_get_state_info (u64 sal_info_type, u64 *sal_info)
85188  {
85189         struct ia64_sal_retval isrv;
85190 +#ifdef CONFIG_XEN
85191 +       if (is_running_on_xen()) {
85192 +               struct xencomm_handle *desc;
85193 +
85194 +               if (xencomm_create(sal_info,
85195 +                                  ia64_sal_get_state_info_size(sal_info_type),
85196 +                                  &desc, GFP_KERNEL))
85197 +                       return 0;
85198 +
85199 +               SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
85200 +                                  desc, 0, 0, 0, 0);
85201 +               xencomm_free(desc);
85202 +       } else
85203 +#endif
85204         SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
85205                       sal_info, 0, 0, 0, 0);
85206         if (isrv.status)
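The hunk above shows the general pattern for SAL calls under Xen/ia64: the guest cannot hand raw virtual addresses to the hypervisor, so the buffer is first described by a xencomm handle, the call is made with that handle, and the handle is freed. A rough sketch of that wrap/call/free sequence with stand-in stubs (the real xencomm_create() takes a gfp_t; everything below is illustrative):

#include <stdio.h>
#include <stdlib.h>

struct xencomm_handle;                      /* opaque, as in asm-ia64/xen/xencomm.h */

/* Stand-ins for the kernel helpers named in the hunk (signatures simplified). */
static int xencomm_create(void *buf, unsigned long len,
                          struct xencomm_handle **desc, int gfp_flags)
{
        (void)buf; (void)gfp_flags;
        *desc = malloc(len ? len : 1);      /* pretend we built a descriptor */
        return *desc ? 0 : -1;
}
static void xencomm_free(struct xencomm_handle *desc) { free(desc); }
static long sal_get_state_info_call(struct xencomm_handle *desc) { (void)desc; return 0; }

static long get_state_info(void *sal_info, unsigned long size)
{
        struct xencomm_handle *desc;
        long status;

        if (xencomm_create(sal_info, size, &desc, 0))   /* describe the buffer to Xen */
                return 0;                                /* same bail-out as the hunk */
        status = sal_get_state_info_call(desc);          /* SAL_GET_STATE_INFO via Xen */
        xencomm_free(desc);
        return status;
}

int main(void)
{
        char buf[128];
        printf("status %ld\n", get_state_info(buf, sizeof buf));
        return 0;
}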
85207 diff -ruNp linux-2.6.19/include/asm-ia64/synch_bitops.h linux-2.6.19-xen-3.0.4/include/asm-ia64/synch_bitops.h
85208 --- linux-2.6.19/include/asm-ia64/synch_bitops.h        1970-01-01 00:00:00.000000000 +0000
85209 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/synch_bitops.h      2007-02-02 19:10:55.000000000 +0000
85210 @@ -0,0 +1,61 @@
85211 +#ifndef __XEN_SYNCH_BITOPS_H__
85212 +#define __XEN_SYNCH_BITOPS_H__
85213 +
85214 +/*
85215 + * Copyright 1992, Linus Torvalds.
85216 + * Heavily modified to provide guaranteed strong synchronisation
85217 + * when communicating with Xen or other guest OSes running on other CPUs.
85218 + */
85219 +
85220 +#define ADDR (*(volatile long *) addr)
85221 +
85222 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
85223 +{
85224 +       set_bit(nr, addr);
85225 +}
85226 +
85227 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
85228 +{
85229 +       clear_bit(nr, addr);
85230 +}
85231 +
85232 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
85233 +{
85234 +       change_bit(nr, addr);
85235 +}
85236 +
85237 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
85238 +{
85239 +    return test_and_set_bit(nr, addr);
85240 +}
85241 +
85242 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
85243 +{
85244 +    return test_and_clear_bit(nr, addr);
85245 +}
85246 +
85247 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
85248 +{
85249 +    return test_and_change_bit(nr, addr);
85250 +}
85251 +
85252 +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
85253 +{
85254 +    return test_bit(nr, addr);
85255 +}
85256 +
85257 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
85258 +{
85259 +    return test_bit(nr, addr);
85260 +}
85261 +
85262 +#define synch_cmpxchg  ia64_cmpxchg4_acq
85263 +
85264 +#define synch_test_bit(nr,addr) \
85265 +(__builtin_constant_p(nr) ? \
85266 + synch_const_test_bit((nr),(addr)) : \
85267 + synch_var_test_bit((nr),(addr)))
85268 +
85269 +#define synch_cmpxchg_subword synch_cmpxchg
85270 +
85271 +#endif /* __XEN_SYNCH_BITOPS_H__ */
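On ia64 the ordinary bitops are already atomic across CPUs, so the synch_* wrappers above simply forward to them; the separate names exist because other architectures need distinct strongly-ordered versions when a word is shared with Xen. A small C11 model of what synch_test_and_clear_bit() provides to, say, event-channel handling (names and values illustrative):

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for synch_test_and_clear_bit(): one atomic read-modify-write on a
 * word shared with another CPU or with Xen.  C11 atomics model the ia64 ops. */
static int synch_test_and_clear_bit(int nr, _Atomic unsigned long *addr)
{
        unsigned long mask = 1UL << nr;
        unsigned long old = atomic_fetch_and(addr, ~mask);
        return (old & mask) != 0;
}

int main(void)
{
        _Atomic unsigned long pending = 1UL << 3;  /* pretend event channel 3 is pending */

        if (synch_test_and_clear_bit(3, &pending))
                printf("event 3 delivered\n");
        printf("still pending? %d\n", synch_test_and_clear_bit(3, &pending));
        return 0;
}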
85272 diff -ruNp linux-2.6.19/include/asm-ia64/system.h linux-2.6.19-xen-3.0.4/include/asm-ia64/system.h
85273 --- linux-2.6.19/include/asm-ia64/system.h      2006-11-29 21:57:37.000000000 +0000
85274 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/system.h    2007-02-02 19:10:55.000000000 +0000
85275 @@ -123,7 +123,7 @@ extern struct ia64_boot_param {
85276  #define __local_irq_save(x)                    \
85277  do {                                           \
85278         ia64_stop();                            \
85279 -       (x) = ia64_getreg(_IA64_REG_PSR);       \
85280 +       (x) = ia64_get_psr_i();                 \
85281         ia64_stop();                            \
85282         ia64_rsm(IA64_PSR_I);                   \
85283  } while (0)
85284 @@ -171,7 +171,7 @@ do {                                                                \
85285  #endif /* !CONFIG_IA64_DEBUG_IRQ */
85286  
85287  #define local_irq_enable()     ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
85288 -#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
85289 +#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_get_psr_i(); })
85290  
85291  #define irqs_disabled()                                \
85292  ({                                             \
85293 diff -ruNp linux-2.6.19/include/asm-ia64/uaccess.h linux-2.6.19-xen-3.0.4/include/asm-ia64/uaccess.h
85294 --- linux-2.6.19/include/asm-ia64/uaccess.h     2006-11-29 21:57:37.000000000 +0000
85295 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/uaccess.h   2007-02-02 19:10:55.000000000 +0000
85296 @@ -365,6 +365,7 @@ ia64_done_with_exception (struct pt_regs
85297  }
85298  
85299  #define ARCH_HAS_TRANSLATE_MEM_PTR     1
85300 +#ifndef CONFIG_XEN
85301  static __inline__ char *
85302  xlate_dev_mem_ptr (unsigned long p)
85303  {
85304 @@ -379,6 +380,25 @@ xlate_dev_mem_ptr (unsigned long p)
85305  
85306         return ptr;
85307  }
85308 +#else
85309 +static __inline__ char *
85310 +xlate_dev_mem_ptr (unsigned long p, ssize_t sz)
85311 +{
85312 +       unsigned long pfn = p >> PAGE_SHIFT;
85313 +
85314 +       if (pfn_valid(pfn) && !PageUncached(pfn_to_page(pfn)))
85315 +               return __va(p);
85316 +
85317 +       return ioremap(p, sz);
85318 +}
85319 +
85320 +static __inline__ void
85321 +xlate_dev_mem_ptr_unmap (char* v)
85322 +{
85323 +       if (REGION_NUMBER(v) == RGN_UNCACHED)
85324 +               iounmap(v);
85325 +}
85326 +#endif
85327  
85328  /*
85329   * Convert a virtual cached kernel memory pointer to an uncached pointer
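The Xen variant of xlate_dev_mem_ptr() above gains a size argument because a frame the guest does not own as ordinary cached RAM cannot be reached through the direct map; it has to be mapped with ioremap() and later released by the new xlate_dev_mem_ptr_unmap() helper. A toy model of that decision, with assumed stand-ins for the kernel primitives:

#include <stdio.h>
#include <stdlib.h>

/* Assumed stand-ins for the kernel primitives used by the Xen variant. */
static int   page_is_plain_ram(unsigned long pfn)   { return pfn < 0x1000; }
static void *direct_map(unsigned long paddr)        { return (void *)(0xe000000000000000UL | paddr); }
static void *io_remap(unsigned long paddr, long sz) { (void)paddr; return malloc(sz > 0 ? (size_t)sz : 1); }
static void  io_unmap(void *p)                      { free(p); }

static void *xlate(unsigned long paddr, long sz, int *needs_unmap)
{
        unsigned long pfn = paddr >> 14;               /* 16 KB pages assumed */

        if (page_is_plain_ram(pfn)) {
                *needs_unmap = 0;
                return direct_map(paddr);              /* cached RAM: use the linear map */
        }
        *needs_unmap = 1;
        return io_remap(paddr, sz);                    /* foreign/uncached: map a window */
}

int main(void)
{
        int needs_unmap;
        void *p = xlate(0x4000000UL, 64, &needs_unmap); /* beyond "RAM": ioremap path */

        printf("needs_unmap=%d p=%p\n", needs_unmap, p);
        if (needs_unmap)
                io_unmap(p);
        return 0;
}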
85330 diff -ruNp linux-2.6.19/include/asm-ia64/xen/privop.h linux-2.6.19-xen-3.0.4/include/asm-ia64/xen/privop.h
85331 --- linux-2.6.19/include/asm-ia64/xen/privop.h  1970-01-01 00:00:00.000000000 +0000
85332 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/xen/privop.h        2007-02-02 19:10:55.000000000 +0000
85333 @@ -0,0 +1,303 @@
85334 +#ifndef _ASM_IA64_XEN_PRIVOP_H
85335 +#define _ASM_IA64_XEN_PRIVOP_H
85336 +
85337 +/*
85338 + * Copyright (C) 2005 Hewlett-Packard Co
85339 + *     Dan Magenheimer <dan.magenheimer@hp.com>
85340 + *
85341 + * Paravirtualizations of privileged operations for Xen/ia64
85342 + *
85343 + */
85344 +
85345 +
85346 +#include <xen/interface/arch-ia64.h>
85347 +
85348 +#define IA64_PARAVIRTUALIZED
85349 +
85350 +/* At 1 MB, before per-cpu space but still addressable using addl instead
85351 +   of movl. */
85352 +#define XSI_BASE                               0xfffffffffff00000
85353 +
85354 +/* Address of mapped regs.  */
85355 +#define XMAPPEDREGS_BASE               (XSI_BASE + XSI_SIZE)
85356 +
85357 +#ifdef __ASSEMBLY__
85358 +#define        XEN_HYPER_RFI                   break HYPERPRIVOP_RFI
85359 +#define        XEN_HYPER_RSM_PSR_DT            break HYPERPRIVOP_RSM_DT
85360 +#define        XEN_HYPER_SSM_PSR_DT            break HYPERPRIVOP_SSM_DT
85361 +#define        XEN_HYPER_COVER                 break HYPERPRIVOP_COVER
85362 +#define        XEN_HYPER_ITC_D                 break HYPERPRIVOP_ITC_D
85363 +#define        XEN_HYPER_ITC_I                 break HYPERPRIVOP_ITC_I
85364 +#define        XEN_HYPER_SSM_I                 break HYPERPRIVOP_SSM_I
85365 +#define        XEN_HYPER_GET_IVR               break HYPERPRIVOP_GET_IVR
85366 +#define        XEN_HYPER_GET_TPR               break HYPERPRIVOP_GET_TPR
85367 +#define        XEN_HYPER_SET_TPR               break HYPERPRIVOP_SET_TPR
85368 +#define        XEN_HYPER_EOI                   break HYPERPRIVOP_EOI
85369 +#define        XEN_HYPER_SET_ITM               break HYPERPRIVOP_SET_ITM
85370 +#define        XEN_HYPER_THASH                 break HYPERPRIVOP_THASH
85371 +#define        XEN_HYPER_PTC_GA                break HYPERPRIVOP_PTC_GA
85372 +#define        XEN_HYPER_ITR_D                 break HYPERPRIVOP_ITR_D
85373 +#define        XEN_HYPER_GET_RR                break HYPERPRIVOP_GET_RR
85374 +#define        XEN_HYPER_SET_RR                break HYPERPRIVOP_SET_RR
85375 +#define        XEN_HYPER_SET_KR                break HYPERPRIVOP_SET_KR
85376 +#define        XEN_HYPER_FC                    break HYPERPRIVOP_FC
85377 +#define        XEN_HYPER_GET_CPUID             break HYPERPRIVOP_GET_CPUID
85378 +#define        XEN_HYPER_GET_PMD               break HYPERPRIVOP_GET_PMD
85379 +#define        XEN_HYPER_GET_EFLAG             break HYPERPRIVOP_GET_EFLAG
85380 +#define        XEN_HYPER_SET_EFLAG             break HYPERPRIVOP_SET_EFLAG
85381 +#define        XEN_HYPER_RSM_BE                break HYPERPRIVOP_RSM_BE
85382 +#define        XEN_HYPER_GET_PSR               break HYPERPRIVOP_GET_PSR
85383 +
85384 +#define XSI_IFS                        (XSI_BASE + XSI_IFS_OFS)
85385 +#define XSI_PRECOVER_IFS       (XSI_BASE + XSI_PRECOVER_IFS_OFS)
85386 +#define XSI_INCOMPL_REGFR      (XSI_BASE + XSI_INCOMPL_REGFR_OFS)
85387 +#define XSI_IFA                        (XSI_BASE + XSI_IFA_OFS)
85388 +#define XSI_ISR                        (XSI_BASE + XSI_ISR_OFS)
85389 +#define XSI_IIM                        (XSI_BASE + XSI_IIM_OFS)
85390 +#define XSI_ITIR               (XSI_BASE + XSI_ITIR_OFS)
85391 +#define XSI_PSR_I_ADDR         (XSI_BASE + XSI_PSR_I_ADDR_OFS)
85392 +#define XSI_PSR_IC             (XSI_BASE + XSI_PSR_IC_OFS)
85393 +#define XSI_IPSR               (XSI_BASE + XSI_IPSR_OFS)
85394 +#define XSI_IIP                        (XSI_BASE + XSI_IIP_OFS)
85395 +#define XSI_BANK1_R16          (XSI_BASE + XSI_BANK1_R16_OFS)
85396 +#define XSI_BANKNUM            (XSI_BASE + XSI_BANKNUM_OFS)
85397 +#define XSI_IHA                        (XSI_BASE + XSI_IHA_OFS)
85398 +#endif
85399 +
85400 +#ifndef __ASSEMBLY__
85401 +#define        XEN_HYPER_SSM_I         asm("break %0" : : "i" (HYPERPRIVOP_SSM_I))
85402 +#define        XEN_HYPER_GET_IVR       asm("break %0" : : "i" (HYPERPRIVOP_GET_IVR))
85403 +
85404 +/************************************************/
85405 +/* Instructions paravirtualized for correctness */
85406 +/************************************************/
85407 +
85408 +/* "fc" and "thash" are privilege-sensitive instructions, meaning they
85409 + *  may have different semantics depending on whether they are executed
85410 + *  at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
85411 + *  be allowed to execute directly, lest incorrect semantics result. */
85412 +extern unsigned long xen_fc(unsigned long addr);
85413 +#define ia64_fc(addr)                  xen_fc((unsigned long)(addr))
85414 +extern unsigned long xen_thash(unsigned long addr);
85415 +#define ia64_thash(addr)               xen_thash((unsigned long)(addr))
85416 +/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
85417 + * is not currently used (though it may be in a long-format VHPT system!)
85418 + * and the semantics of cover only change if psr.ic is off which is very
85419 + * rare (and currently non-existent outside of assembly code) */
85420 +
85421 +/* There are also privilege-sensitive registers.  These registers are
85422 + * readable at any privilege level but only writable at PL0. */
85423 +extern unsigned long xen_get_cpuid(int index);
85424 +#define        ia64_get_cpuid(i)               xen_get_cpuid(i)
85425 +extern unsigned long xen_get_pmd(int index);
85426 +#define        ia64_get_pmd(i)                 xen_get_pmd(i)
85427 +extern unsigned long xen_get_eflag(void);      /* see xen_ia64_getreg */
85428 +extern void xen_set_eflag(unsigned long);      /* see xen_ia64_setreg */
85429 +
85430 +/************************************************/
85431 +/* Instructions paravirtualized for performance */
85432 +/************************************************/
85433 +
85434 +/* Xen uses memory-mapped virtual privileged registers for access to many
85435 + * performance-sensitive privileged registers.  Some, like the processor
85436 + * status register (psr), are broken up into multiple memory locations.
85437 + * Others, like "pend", are abstractions based on privileged registers.
85438 + * "Pend" is guaranteed to be set if reading cr.ivr would return a
85439 + * (non-spurious) interrupt. */
85440 +#define XEN_MAPPEDREGS ((struct mapped_regs *)XMAPPEDREGS_BASE)
85441 +#define XSI_PSR_I                      \
85442 +       (*XEN_MAPPEDREGS->interrupt_mask_addr)
85443 +#define xen_get_virtual_psr_i()                \
85444 +       (!XSI_PSR_I)
85445 +#define xen_set_virtual_psr_i(_val)    \
85446 +       ({ XSI_PSR_I = (uint8_t)(_val) ? 0 : 1; })
85447 +#define xen_set_virtual_psr_ic(_val)   \
85448 +       ({ XEN_MAPPEDREGS->interrupt_collection_enabled = _val ? 1 : 0; })
85449 +#define xen_get_virtual_pend()         \
85450 +       (*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1))
85451 +
85452 +/* Hyperprivops are "break" instructions with a well-defined API.
85453 + * In particular, the virtual psr.ic bit must be off; in this way
85454 + * it is guaranteed to never conflict with a linux break instruction.
85455 + * Normally, this is done in a xen stub but this one is frequent enough
85456 + * that we inline it */
85457 +#define xen_hyper_ssm_i()                                              \
85458 +({                                                                     \
85459 +       xen_set_virtual_psr_i(0);                                       \
85460 +       xen_set_virtual_psr_ic(0);                                      \
85461 +       XEN_HYPER_SSM_I;                                                \
85462 +})
85463 +
85464 +/* turning off interrupts can be paravirtualized simply by writing
85465 + * to a memory-mapped virtual psr.i bit (implemented as a 16-bit bool) */
85466 +#define xen_rsm_i()    xen_set_virtual_psr_i(0)
85467 +
85468 +/* turning on interrupts is a bit more complicated.. write to the
85469 + * memory-mapped virtual psr.i bit first (to avoid race condition),
85470 + * then if any interrupts were pending, we have to execute a hyperprivop
85471 + * to ensure the pending interrupt gets delivered; else we're done! */
85472 +#define xen_ssm_i()                                                    \
85473 +({                                                                     \
85474 +       int old = xen_get_virtual_psr_i();                              \
85475 +       xen_set_virtual_psr_i(1);                                       \
85476 +       if (!old && xen_get_virtual_pend()) xen_hyper_ssm_i();          \
85477 +})
85478 +
85479 +#define xen_ia64_intrin_local_irq_restore(x)                           \
85480 +{                                                                      \
85481 +     if (is_running_on_xen()) {                                                \
85482 +       if ((x) & IA64_PSR_I) { xen_ssm_i(); }                          \
85483 +       else { xen_rsm_i(); }                                           \
85484 +    }                                                                  \
85485 +    else __ia64_intrin_local_irq_restore((x));                         \
85486 +}
85487 +
85488 +#define        xen_get_psr_i()                                                 \
85489 +(                                                                      \
85490 +       (is_running_on_xen()) ?                                         \
85491 +               (xen_get_virtual_psr_i() ? IA64_PSR_I : 0)              \
85492 +               : __ia64_get_psr_i()                                    \
85493 +)
85494 +
85495 +#define xen_ia64_ssm(mask)                                             \
85496 +{                                                                      \
85497 +       if ((mask)==IA64_PSR_I) {                                       \
85498 +               if (is_running_on_xen()) { xen_ssm_i(); }                       \
85499 +               else { __ia64_ssm(mask); }                              \
85500 +       }                                                               \
85501 +       else { __ia64_ssm(mask); }                                      \
85502 +}
85503 +
85504 +#define xen_ia64_rsm(mask)                                             \
85505 +{                                                                      \
85506 +       if ((mask)==IA64_PSR_I) {                                       \
85507 +               if (is_running_on_xen()) { xen_rsm_i(); }                       \
85508 +               else { __ia64_rsm(mask); }                              \
85509 +       }                                                               \
85510 +       else { __ia64_rsm(mask); }                                      \
85511 +}
85512 +
85513 +
85514 +/* Although all privileged operations can be left to trap and will
85515 + * be properly handled by Xen, some are frequent enough that we use
85516 + * hyperprivops for performance. */
85517 +
85518 +extern unsigned long xen_get_ivr(void);
85519 +extern unsigned long xen_get_tpr(void);
85520 +extern void xen_set_itm(unsigned long);
85521 +extern void xen_set_tpr(unsigned long);
85522 +extern void xen_eoi(void);
85523 +extern void xen_set_rr(unsigned long index, unsigned long val);
85524 +extern unsigned long xen_get_rr(unsigned long index);
85525 +extern void xen_set_kr(unsigned long index, unsigned long val);
85526 +extern void xen_ptcga(unsigned long addr, unsigned long size);
85527 +
85528 +/* Note: It may look wrong to test for is_running_on_xen() in each case.
85529 + * However regnum is always a constant so, as written, the compiler
85530 + * eliminates the switch statement, whereas is_running_on_xen() must be
85531 + * tested dynamically. */
85532 +#define xen_ia64_getreg(regnum)                                                \
85533 +({                                                                     \
85534 +       __u64 ia64_intri_res;                                           \
85535 +                                                                       \
85536 +       switch(regnum) {                                                \
85537 +       case _IA64_REG_CR_IVR:                                          \
85538 +               ia64_intri_res = (is_running_on_xen()) ?                        \
85539 +                       xen_get_ivr() :                                 \
85540 +                       __ia64_getreg(regnum);                          \
85541 +               break;                                                  \
85542 +       case _IA64_REG_CR_TPR:                                          \
85543 +               ia64_intri_res = (is_running_on_xen()) ?                        \
85544 +                       xen_get_tpr() :                                 \
85545 +                       __ia64_getreg(regnum);                          \
85546 +               break;                                                  \
85547 +       case _IA64_REG_AR_EFLAG:                                        \
85548 +               ia64_intri_res = (is_running_on_xen()) ?                        \
85549 +                       xen_get_eflag() :                               \
85550 +                       __ia64_getreg(regnum);                          \
85551 +               break;                                                  \
85552 +       default:                                                        \
85553 +               ia64_intri_res = __ia64_getreg(regnum);                 \
85554 +               break;                                                  \
85555 +       }                                                               \
85556 +       ia64_intri_res;                                                 \
85557 +})
85558 +
85559 +#define xen_ia64_setreg(regnum,val)                                    \
85560 +({                                                                     \
85561 +       switch(regnum) {                                                \
85562 +       case _IA64_REG_AR_KR0 ... _IA64_REG_AR_KR7:                     \
85563 +               (is_running_on_xen()) ?                                 \
85564 +                       xen_set_kr((regnum-_IA64_REG_AR_KR0), val) :    \
85565 +                       __ia64_setreg(regnum,val);                      \
85566 +               break;                                                  \
85567 +       case _IA64_REG_CR_ITM:                                          \
85568 +               (is_running_on_xen()) ?                                 \
85569 +                       xen_set_itm(val) :                              \
85570 +                       __ia64_setreg(regnum,val);                      \
85571 +               break;                                                  \
85572 +       case _IA64_REG_CR_TPR:                                          \
85573 +               (is_running_on_xen()) ?                                 \
85574 +                       xen_set_tpr(val) :                              \
85575 +                       __ia64_setreg(regnum,val);                      \
85576 +               break;                                                  \
85577 +       case _IA64_REG_CR_EOI:                                          \
85578 +               (is_running_on_xen()) ?                                 \
85579 +                       xen_eoi() :                                     \
85580 +                       __ia64_setreg(regnum,val);                      \
85581 +               break;                                                  \
85582 +       case _IA64_REG_AR_EFLAG:                                        \
85583 +               (is_running_on_xen()) ?                                 \
85584 +                       xen_set_eflag(val) :                            \
85585 +                       __ia64_setreg(regnum,val);                      \
85586 +               break;                                                  \
85587 +       default:                                                        \
85588 +               __ia64_setreg(regnum,val);                              \
85589 +               break;                                                  \
85590 +       }                                                               \
85591 +})
85592 +
85593 +#define ia64_ssm                       xen_ia64_ssm
85594 +#define ia64_rsm                       xen_ia64_rsm
85595 +#define ia64_intrin_local_irq_restore  xen_ia64_intrin_local_irq_restore
85596 +#define        ia64_ptcga                      xen_ptcga
85597 +#define        ia64_set_rr(index,val)          xen_set_rr(index,val)
85598 +#define        ia64_get_rr(index)              xen_get_rr(index)
85599 +#define ia64_getreg                    xen_ia64_getreg
85600 +#define ia64_setreg                    xen_ia64_setreg
85601 +#define        ia64_get_psr_i                  xen_get_psr_i
85602 +
85603 +/* the remainder of these are not performance-sensitive, so it's
85604 + * OK to not paravirtualize and just take a privop trap and emulate */
85605 +#define ia64_hint                      __ia64_hint
85606 +#define ia64_set_pmd                   __ia64_set_pmd
85607 +#define ia64_itci                      __ia64_itci
85608 +#define ia64_itcd                      __ia64_itcd
85609 +#define ia64_itri                      __ia64_itri
85610 +#define ia64_itrd                      __ia64_itrd
85611 +#define ia64_tpa                       __ia64_tpa
85612 +#define ia64_set_ibr                   __ia64_set_ibr
85613 +#define ia64_set_pkr                   __ia64_set_pkr
85614 +#define ia64_set_pmc                   __ia64_set_pmc
85615 +#define ia64_get_ibr                   __ia64_get_ibr
85616 +#define ia64_get_pkr                   __ia64_get_pkr
85617 +#define ia64_get_pmc                   __ia64_get_pmc
85618 +#define ia64_ptce                      __ia64_ptce
85619 +#define ia64_ptcl                      __ia64_ptcl
85620 +#define ia64_ptri                      __ia64_ptri
85621 +#define ia64_ptrd                      __ia64_ptrd
85622 +
85623 +#endif /* !__ASSEMBLY__ */
85624 +
85625 +/* these routines utilize privilege-sensitive or performance-sensitive
85626 + * privileged instructions so the code must be replaced with
85627 + * paravirtualized versions */
85628 +#define ia64_pal_halt_light            xen_pal_halt_light
85629 +#define        ia64_leave_kernel               xen_leave_kernel
85630 +#define        ia64_leave_syscall              xen_leave_syscall
85631 +#define        ia64_trace_syscall              xen_trace_syscall
85632 +#define        ia64_ret_from_clone             xen_ret_from_clone
85633 +#define        ia64_switch_to                  xen_switch_to
85634 +#define        ia64_pal_call_static            xen_pal_call_static
85635 +
85636 +#endif /* _ASM_IA64_XEN_PRIVOP_H */
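The comments above xen_ssm_i() describe the cheap path for re-enabling interrupts: store to the shared virtual psr.i byte first, and only issue the SSM_I hyperprivop when an interrupt was already pending while interrupts were masked. A plain-C model of that control flow (all names and the mask/pend encoding are illustrative):

#include <stdio.h>

static unsigned char vcpu_int_masked  = 1;   /* models *interrupt_mask_addr: 1 = masked */
static unsigned char vcpu_int_pending = 1;   /* models the "pend" byte next to it */
static int hyperprivops;

static int  psr_i_enabled(void)   { return !vcpu_int_masked; }
static void set_psr_i(int enable) { vcpu_int_masked = enable ? 0 : 1; }
static void hyper_ssm_i(void)     { hyperprivops++; vcpu_int_pending = 0; /* Xen injects it */ }

static void ssm_i(void)                       /* models xen_ssm_i() */
{
        int was_enabled = psr_i_enabled();

        set_psr_i(1);                         /* cheap: a plain store to shared memory */
        if (!was_enabled && vcpu_int_pending)
                hyper_ssm_i();                /* only then pay for the SSM_I hyperprivop */
}

int main(void)
{
        ssm_i();                              /* masked + pending: one hyperprivop */
        ssm_i();                              /* already enabled: no hyperprivop */
        printf("hyperprivops issued: %d\n", hyperprivops);
        return 0;
}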
85637 diff -ruNp linux-2.6.19/include/asm-ia64/xen/xcom_hcall.h linux-2.6.19-xen-3.0.4/include/asm-ia64/xen/xcom_hcall.h
85638 --- linux-2.6.19/include/asm-ia64/xen/xcom_hcall.h      1970-01-01 00:00:00.000000000 +0000
85639 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/xen/xcom_hcall.h    2007-02-02 19:10:55.000000000 +0000
85640 @@ -0,0 +1,86 @@
85641 +/*
85642 + * Copyright (C) 2006 Tristan Gingold <tristan.gingold@bull.net>, Bull SAS
85643 + *
85644 + * This program is free software; you can redistribute it and/or modify
85645 + * it under the terms of the GNU General Public License as published by
85646 + * the Free Software Foundation; either version 2 of the License, or
85647 + * (at your option) any later version.
85648 + * 
85649 + * This program is distributed in the hope that it will be useful,
85650 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
85651 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
85652 + * GNU General Public License for more details.
85653 + * 
85654 + * You should have received a copy of the GNU General Public License
85655 + * along with this program; if not, write to the Free Software
85656 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
85657 + */
85658 +
85659 +#ifndef _LINUX_XENCOMM_HCALL_H_
85660 +#define _LINUX_XENCOMM_HCALL_H_
85661 +
85662 +/* These functions create an inline descriptor for the parameters and
85663 +   call the corresponding xencomm_arch_hypercall_X.
85664 +   Architectures should define HYPERVISOR_xxx as xencomm_hypercall_xxx unless
85665 +   they want to use their own wrapper.  */
85666 +extern int xencomm_hypercall_console_io(int cmd, int count, char *str);
85667 +
85668 +extern int xencomm_hypercall_event_channel_op(int cmd, void *op);
85669 +
85670 +extern int xencomm_hypercall_xen_version(int cmd, void *arg);
85671 +
85672 +extern int xencomm_hypercall_physdev_op(int cmd, void *op);
85673 +
85674 +extern int xencomm_hypercall_grant_table_op(unsigned int cmd, void *op,
85675 +                                            unsigned int count);
85676 +
85677 +extern int xencomm_hypercall_sched_op(int cmd, void *arg);
85678 +
85679 +extern int xencomm_hypercall_multicall(void *call_list, int nr_calls);
85680 +
85681 +extern int xencomm_hypercall_callback_op(int cmd, void *arg);
85682 +
85683 +extern int xencomm_hypercall_memory_op(unsigned int cmd, void *arg);
85684 +
85685 +extern unsigned long xencomm_hypercall_hvm_op(int cmd, void *arg);
85686 +
85687 +extern int xencomm_hypercall_suspend(unsigned long srec);
85688 +
85689 +extern int xencomm_hypercall_xenoprof_op(int op, void *arg);
85690 +
85691 +extern int xencomm_hypercall_perfmon_op(unsigned long cmd, void* arg,
85692 +                                        unsigned long count);
85693 +
85694 +/* Using mini xencomm.  */
85695 +extern int xencomm_mini_hypercall_console_io(int cmd, int count, char *str);
85696 +
85697 +extern int xencomm_mini_hypercall_event_channel_op(int cmd, void *op);
85698 +
85699 +extern int xencomm_mini_hypercall_xen_version(int cmd, void *arg);
85700 +
85701 +extern int xencomm_mini_hypercall_physdev_op(int cmd, void *op);
85702 +
85703 +extern int xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
85704 +                                                 unsigned int count);
85705 +
85706 +extern int xencomm_mini_hypercall_sched_op(int cmd, void *arg);
85707 +
85708 +extern int xencomm_mini_hypercall_multicall(void *call_list, int nr_calls);
85709 +
85710 +extern int xencomm_mini_hypercall_callback_op(int cmd, void *arg);
85711 +
85712 +extern int xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg);
85713 +
85714 +extern unsigned long xencomm_mini_hypercall_hvm_op(int cmd, void *arg);
85715 +
85716 +extern int xencomm_mini_hypercall_xenoprof_op(int op, void *arg);
85717 +
85718 +extern int xencomm_mini_hypercall_perfmon_op(unsigned long cmd, void* arg,
85719 +                                             unsigned long count);
85720 +
85721 +/* For privcmd.  Locally declare argument type to avoid include storm.
85722 +   Type coherency will be checked within privcmd.c  */
85723 +struct privcmd_hypercall;
85724 +extern int privcmd_hypercall(struct privcmd_hypercall *hypercall);
85725 +
85726 +#endif /* _LINUX_XENCOMM_HCALL_H_ */
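The header comment says architectures are expected to alias their HYPERVISOR_xxx entry points to these xencomm_hypercall_xxx wrappers (the mini variants exist for contexts where a descriptor cannot be allocated). A sketch of that aliasing with a stub wrapper; the command value and macro below are illustrative only:

#include <stdio.h>

/* Stub standing in for the wrapper declared above. */
static int xencomm_hypercall_sched_op(int cmd, void *arg)
{
        printf("sched_op cmd=%d arg=%p routed through xencomm\n", cmd, arg);
        return 0;
}

/* The aliasing the header comment asks architectures to provide. */
#define HYPERVISOR_sched_op(cmd, arg) xencomm_hypercall_sched_op(cmd, arg)

int main(void)
{
        int dummy = 0;
        return HYPERVISOR_sched_op(0 /* command number, illustrative */, &dummy);
}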
85727 diff -ruNp linux-2.6.19/include/asm-ia64/xen/xencomm.h linux-2.6.19-xen-3.0.4/include/asm-ia64/xen/xencomm.h
85728 --- linux-2.6.19/include/asm-ia64/xen/xencomm.h 1970-01-01 00:00:00.000000000 +0000
85729 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/xen/xencomm.h       2007-02-02 19:10:55.000000000 +0000
85730 @@ -0,0 +1,60 @@
85731 +/*
85732 + * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation
85733 + *
85734 + * This program is free software; you can redistribute it and/or modify
85735 + * it under the terms of the GNU General Public License as published by
85736 + * the Free Software Foundation; either version 2 of the License, or
85737 + * (at your option) any later version.
85738 + * 
85739 + * This program is distributed in the hope that it will be useful,
85740 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
85741 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
85742 + * GNU General Public License for more details.
85743 + * 
85744 + * You should have received a copy of the GNU General Public License
85745 + * along with this program; if not, write to the Free Software
85746 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
85747 + */
85748 +
85749 +#ifndef _LINUX_XENCOMM_H_
85750 +#define _LINUX_XENCOMM_H_
85751 +
85752 +#include <xen/interface/xencomm.h>
85753 +
85754 +#define XENCOMM_MINI_ADDRS 3
85755 +struct xencomm_mini {
85756 +       struct xencomm_desc _desc;
85757 +       uint64_t address[XENCOMM_MINI_ADDRS];
85758 +};
85759 +
85760 +/* Must be called before any hypercall.  */
85761 +extern void xencomm_init (void);
85762 +
85763 +/* To avoid an additional virt to phys conversion, an opaque structure is
85764 +   presented.  */
85765 +struct xencomm_handle;
85766 +
85767 +extern int xencomm_create(void *buffer, unsigned long bytes,
85768 +                          struct xencomm_handle **desc, gfp_t type);
85769 +extern void xencomm_free(struct xencomm_handle *desc);
85770 +
85771 +extern int xencomm_create_mini(struct xencomm_mini *area, int *nbr_area,
85772 +                               void *buffer, unsigned long bytes,
85773 +                               struct xencomm_handle **ret);
85774 +
85775 +/* Translate virtual address to physical address.  */
85776 +extern unsigned long xencomm_vaddr_to_paddr(unsigned long vaddr);
85777 +
85778 +/* Inline version.  To be used only on linear space (kernel space).  */
85779 +static inline struct xencomm_handle *
85780 +xencomm_create_inline(void *buffer)
85781 +{
85782 +       unsigned long paddr;
85783 +
85784 +       paddr = xencomm_vaddr_to_paddr((unsigned long)buffer);
85785 +       return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
85786 +}
85787 +
85788 +#define xen_guest_handle(hnd)  ((hnd).p)
85789 +
85790 +#endif /* _LINUX_XENCOMM_H_ */
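xencomm_create_inline() only works for buffers in the kernel's linear mapping because the whole "descriptor" is just the buffer's physical address with XENCOMM_INLINE_FLAG or'ed in; anything not physically contiguous has to go through xencomm_create() instead. A toy model of that encoding (the flag position and the virt-to-phys rule are assumptions):

#include <stdio.h>
#include <stdint.h>

#define XENCOMM_INLINE_FLAG (1UL << 63)                 /* assumed flag position */

/* Toy virt-to-phys rule: strip the region bits of a direct-mapped address. */
static uintptr_t vaddr_to_paddr(uintptr_t vaddr) { return vaddr & ~(0xfUL << 60); }

static void *create_inline(void *buffer)                /* models xencomm_create_inline() */
{
        uintptr_t paddr = vaddr_to_paddr((uintptr_t)buffer);
        return (void *)(paddr | XENCOMM_INLINE_FLAG);   /* no allocation needed at all */
}

int main(void)
{
        char buf[64];
        void *handle = create_inline(buf);

        printf("handle %p (inline flag set: %d)\n", handle,
               ((uintptr_t)handle & XENCOMM_INLINE_FLAG) != 0);
        return 0;
}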
85791 diff -ruNp linux-2.6.19/include/asm-ia64/xenoprof.h linux-2.6.19-xen-3.0.4/include/asm-ia64/xenoprof.h
85792 --- linux-2.6.19/include/asm-ia64/xenoprof.h    1970-01-01 00:00:00.000000000 +0000
85793 +++ linux-2.6.19-xen-3.0.4/include/asm-ia64/xenoprof.h  2007-02-02 19:10:55.000000000 +0000
85794 @@ -0,0 +1,48 @@
85795 +/******************************************************************************
85796 + * asm-ia64/xenoprof.h
85797 + *
85798 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
85799 + *                    VA Linux Systems Japan K.K.
85800 + *
85801 + * This program is free software; you can redistribute it and/or modify
85802 + * it under the terms of the GNU General Public License as published by
85803 + * the Free Software Foundation; either version 2 of the License, or
85804 + * (at your option) any later version.
85805 + *
85806 + * This program is distributed in the hope that it will be useful,
85807 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
85808 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
85809 + * GNU General Public License for more details.
85810 + *
85811 + * You should have received a copy of the GNU General Public License
85812 + * along with this program; if not, write to the Free Software
85813 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
85814 + *
85815 + */
85816 +#ifndef __ASM_XENOPROF_H__
85817 +#define __ASM_XENOPROF_H__
85818 +#ifdef CONFIG_XEN
85819 +
85820 +#undef HAVE_XENOPROF_CREATE_FILES
85821 +
85822 +struct xenoprof_init;
85823 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
85824 +void xenoprof_arch_counter(void);
85825 +void xenoprof_arch_start(void);
85826 +void xenoprof_arch_stop(void);
85827 +
85828 +struct xenoprof_arch_shared_buffer {
85829 +       struct resource*        res;
85830 +};
85831 +
85832 +struct xenoprof_shared_buffer;
85833 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
85834 +struct xenoprof_get_buffer;
85835 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer,
85836 +                                    struct xenoprof_shared_buffer* sbuf);
85837 +struct xenoprof_passive;
85838 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain,
85839 +                              struct xenoprof_shared_buffer* sbuf);
85840 +
85841 +#endif /* CONFIG_XEN */
85842 +#endif /* __ASM_XENOPROF_H__ */
85843 diff -ruNp linux-2.6.19/include/asm-um/page.h linux-2.6.19-xen-3.0.4/include/asm-um/page.h
85844 --- linux-2.6.19/include/asm-um/page.h  2006-11-29 21:57:37.000000000 +0000
85845 +++ linux-2.6.19-xen-3.0.4/include/asm-um/page.h        2007-02-02 19:10:57.000000000 +0000
85846 @@ -114,7 +114,7 @@ extern unsigned long uml_physmem;
85847  extern struct page *arch_validate(struct page *page, gfp_t mask, int order);
85848  #define HAVE_ARCH_VALIDATE
85849  
85850 -extern void arch_free_page(struct page *page, int order);
85851 +extern int arch_free_page(struct page *page, int order);
85852  #define HAVE_ARCH_FREE_PAGE
85853  
85854  #include <asm-generic/memory_model.h>
85855 diff -ruNp linux-2.6.19/include/asm-x86_64/acpi.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/acpi.h
85856 --- linux-2.6.19/include/asm-x86_64/acpi.h      2006-11-29 21:57:37.000000000 +0000
85857 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/acpi.h    2007-02-02 19:10:57.000000000 +0000
85858 @@ -160,7 +160,7 @@ extern int acpi_pci_disabled;
85859  
85860  extern u8 x86_acpiid_to_apicid[];
85861  
85862 -#define ARCH_HAS_POWER_INIT 1
85863 +#define ARCH_HAS_POWER_INIT    1
85864  
85865  extern int acpi_skip_timer_override;
85866  extern int acpi_use_timer_override;
85867 diff -ruNp linux-2.6.19/include/asm-x86_64/apic.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/apic.h
85868 --- linux-2.6.19/include/asm-x86_64/apic.h      2006-11-29 21:57:37.000000000 +0000
85869 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/apic.h    2007-02-02 19:10:58.000000000 +0000
85870 @@ -95,11 +95,13 @@ extern void setup_APIC_extened_lvt(unsig
85871  #define K8_APIC_EXT_INT_MSG_EXT 0x7
85872  #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD    0
85873  
85874 +#ifndef CONFIG_XEN
85875  void smp_send_timer_broadcast_ipi(void);
85876  void switch_APIC_timer_to_ipi(void *cpumask);
85877  void switch_ipi_to_APIC_timer(void *cpumask);
85878  
85879  #define ARCH_APICTIMER_STOPS_ON_C3     1
85880 +#endif
85881  
85882  extern unsigned boot_cpu_id;
85883  
85884 diff -ruNp linux-2.6.19/include/asm-x86_64/dwarf2.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/dwarf2.h
85885 --- linux-2.6.19/include/asm-x86_64/dwarf2.h    2006-11-29 21:57:37.000000000 +0000
85886 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/dwarf2.h  2007-02-02 19:10:58.000000000 +0000
85887 @@ -13,7 +13,7 @@
85888     away for older version. 
85889   */
85890  
85891 -#ifdef CONFIG_AS_CFI
85892 +#ifdef CONFIG_UNWIND_INFO
85893  
85894  #define CFI_STARTPROC .cfi_startproc
85895  #define CFI_ENDPROC .cfi_endproc
85896 diff -ruNp linux-2.6.19/include/asm-x86_64/ipi.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/ipi.h
85897 --- linux-2.6.19/include/asm-x86_64/ipi.h       2006-11-29 21:57:37.000000000 +0000
85898 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/ipi.h     2007-02-02 19:10:58.000000000 +0000
85899 @@ -49,8 +49,12 @@ static inline int __prepare_ICR2 (unsign
85900         return SET_APIC_DEST_FIELD(mask);
85901  }
85902  
85903 +
85904  static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
85905  {
85906 +#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST
85907 +       BUG();
85908 +#else
85909         /*
85910          * Subtle. In the case of the 'never do double writes' workaround
85911          * we have to lock out interrupts to be safe.  As we don't care
85912 @@ -74,6 +78,7 @@ static inline void __send_IPI_shortcut(u
85913          * Send the IPI. The write to APIC_ICR fires this off.
85914          */
85915         apic_write(APIC_ICR, cfg);
85916 +#endif /* !CONFIG_XEN_UNPRIVILEGED_GUEST */
85917  }
85918  
85919  
85920 diff -ruNp linux-2.6.19/include/asm-x86_64/kexec.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/kexec.h
85921 --- linux-2.6.19/include/asm-x86_64/kexec.h     2006-11-29 21:57:37.000000000 +0000
85922 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/kexec.h   2007-02-02 19:10:58.000000000 +0000
85923 @@ -91,6 +91,19 @@ relocate_kernel(unsigned long indirectio
85924                 unsigned long page_list,
85925                 unsigned long start_address) ATTRIB_NORET;
85926  
85927 +/* Under Xen we need to work with machine addresses. These macros give the
85928 + * machine address of a certain page to the generic kexec code instead of 
85929 + * the pseudo physical address which would be given by the default macros.
85930 + */
85931 +
85932 +#ifdef CONFIG_XEN
85933 +#define KEXEC_ARCH_HAS_PAGE_MACROS
85934 +#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
85935 +#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
85936 +#define kexec_virt_to_phys(addr) virt_to_machine(addr)
85937 +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
85938 +#endif
85939 +
85940  #endif /* __ASSEMBLY__ */
85941  
85942  #endif /* _X86_64_KEXEC_H */
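The kexec overrides above exist because generic kexec thinks in physical frame numbers, while an x86-64 Xen guest's pseudo-physical frames are scattered across machine frames; every lookup therefore goes through the p2m table. A tiny model of what kexec_page_to_pfn() reduces to once page_to_pfn() has produced a pseudo-physical frame number (table contents assumed):

#include <stdio.h>

/* Assumed contents of the pseudo-physical -> machine frame table (p2m). */
static const unsigned long p2m[4] = { 0x210, 0x057, 0x9a1, 0x003 };

static unsigned long pfn_to_mfn(unsigned long pfn) { return p2m[pfn]; }

int main(void)
{
        /* What kexec_page_to_pfn(page) boils down to once page_to_pfn() yields 2: */
        printf("pseudo-physical frame 2 lives in machine frame %#lx\n", pfn_to_mfn(2));
        return 0;
}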
85943 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/agp.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/agp.h
85944 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/agp.h  1970-01-01 00:00:00.000000000 +0000
85945 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/agp.h        2007-02-02 19:10:58.000000000 +0000
85946 @@ -0,0 +1,35 @@
85947 +#ifndef AGP_H
85948 +#define AGP_H 1
85949 +
85950 +#include <asm/cacheflush.h>
85951 +#include <asm/system.h>
85952 +
85953 +/*
85954 + * Functions to keep the agpgart mappings coherent.
85955 + * The GART gives the CPU a physical alias of memory. The alias is
85956 + * mapped uncacheable. Make sure there are no conflicting mappings
85957 + * with different cachability attributes for the same page.
85958 + */
85959 +
85960 +int map_page_into_agp(struct page *page);
85961 +int unmap_page_from_agp(struct page *page);
85962 +#define flush_agp_mappings() global_flush_tlb()
85963 +
85964 +/* Could use CLFLUSH here if the cpu supports it. But then it would
85965 +   need to be called for each cacheline of the whole page so it may not be
85966 +   worth it. Would need a page for it. */
85967 +#define flush_agp_cache() wbinvd()
85968 +
85969 +/* Convert a physical address to an address suitable for the GART. */
85970 +#define phys_to_gart(x) phys_to_machine(x)
85971 +#define gart_to_phys(x) machine_to_phys(x)
85972 +
85973 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
85974 +#define alloc_gatt_pages(order)        ({                                          \
85975 +       char *_t; dma_addr_t _d;                                            \
85976 +       _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
85977 +       _t; })
85978 +#define free_gatt_pages(table, order)  \
85979 +       dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
85980 +
85981 +#endif
85982 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/arch_hooks.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/arch_hooks.h
85983 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/arch_hooks.h   1970-01-01 00:00:00.000000000 +0000
85984 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/arch_hooks.h 2007-02-02 19:10:58.000000000 +0000
85985 @@ -0,0 +1,27 @@
85986 +#ifndef _ASM_ARCH_HOOKS_H
85987 +#define _ASM_ARCH_HOOKS_H
85988 +
85989 +#include <linux/interrupt.h>
85990 +
85991 +/*
85992 + *     linux/include/asm/arch_hooks.h
85993 + *
85994 + *     define the architecture specific hooks 
85995 + */
85996 +
85997 +/* these aren't arch hooks, they are generic routines
85998 + * that can be used by the hooks */
85999 +extern void init_ISA_irqs(void);
86000 +extern void apic_intr_init(void);
86001 +extern void smp_intr_init(void);
86002 +extern irqreturn_t timer_interrupt(int irq, void *dev_id);
86003 +
86004 +/* these are the defined hooks */
86005 +extern void intr_init_hook(void);
86006 +extern void pre_intr_init_hook(void);
86007 +extern void pre_setup_arch_hook(void);
86008 +extern void trap_init_hook(void);
86009 +extern void time_init_hook(void);
86010 +extern void mca_nmi_hook(void);
86011 +
86012 +#endif
86013 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/bootsetup.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/bootsetup.h
86014 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/bootsetup.h    1970-01-01 00:00:00.000000000 +0000
86015 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/bootsetup.h  2007-02-02 19:10:58.000000000 +0000
86016 @@ -0,0 +1,40 @@
86017 +
86018 +#ifndef _X86_64_BOOTSETUP_H
86019 +#define _X86_64_BOOTSETUP_H 1
86020 +
86021 +#define BOOT_PARAM_SIZE                4096
86022 +extern char x86_boot_params[BOOT_PARAM_SIZE];
86023 +
86024 +/*
86025 + * This is set up by the setup-routine at boot-time
86026 + */
86027 +#define PARAM  ((unsigned char *)x86_boot_params)
86028 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
86029 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
86030 +#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0))
86031 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
86032 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
86033 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
86034 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
86035 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
86036 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
86037 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
86038 +#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
86039 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
86040 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
86041 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
86042 +#define KERNEL_START (*(unsigned int *) (PARAM+0x214))
86043 +#define INITRD_START (__pa(xen_start_info->mod_start))
86044 +#define INITRD_SIZE (xen_start_info->mod_len)
86045 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
86046 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
86047 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
86048 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
86049 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
86050 +#define COMMAND_LINE saved_command_line
86051 +
86052 +#define RAMDISK_IMAGE_START_MASK       0x07FF
86053 +#define RAMDISK_PROMPT_FLAG            0x8000
86054 +#define RAMDISK_LOAD_FLAG              0x4000  
86055 +
86056 +#endif
86057 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/desc.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/desc.h
86058 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/desc.h 1970-01-01 00:00:00.000000000 +0000
86059 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/desc.h       2007-02-02 19:10:58.000000000 +0000
86060 @@ -0,0 +1,263 @@
86061 +/* Written 2000 by Andi Kleen */ 
86062 +#ifndef __ARCH_DESC_H
86063 +#define __ARCH_DESC_H
86064 +
86065 +#include <linux/threads.h>
86066 +#include <asm/ldt.h>
86067 +
86068 +#ifndef __ASSEMBLY__
86069 +
86070 +#include <linux/string.h>
86071 +#include <linux/smp.h>
86072 +
86073 +#include <asm/segment.h>
86074 +#include <asm/mmu.h>
86075 +
86076 +// 8 byte segment descriptor
86077 +struct desc_struct { 
86078 +       u16 limit0;
86079 +       u16 base0;
86080 +       unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
86081 +       unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
86082 +} __attribute__((packed)); 
86083 +
86084 +struct n_desc_struct { 
86085 +       unsigned int a,b;
86086 +};     
86087 +
86088 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
86089 +
86090 +enum { 
86091 +       GATE_INTERRUPT = 0xE, 
86092 +       GATE_TRAP = 0xF,        
86093 +       GATE_CALL = 0xC,
86094 +};     
86095 +
86096 +// 16byte gate
86097 +struct gate_struct {          
86098 +       u16 offset_low;
86099 +       u16 segment; 
86100 +       unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
86101 +       u16 offset_middle;
86102 +       u32 offset_high;
86103 +       u32 zero1; 
86104 +} __attribute__((packed));
86105 +
86106 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) 
86107 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
86108 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
86109 +
86110 +enum { 
86111 +       DESC_TSS = 0x9,
86112 +       DESC_LDT = 0x2,
86113 +}; 
86114 +
86115 +// LDT or TSS descriptor in the GDT. 16 bytes.
86116 +struct ldttss_desc { 
86117 +       u16 limit0;
86118 +       u16 base0;
86119 +       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
86120 +       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
86121 +       u32 base3;
86122 +       u32 zero1; 
86123 +} __attribute__((packed)); 
86124 +
86125 +struct desc_ptr {
86126 +       unsigned short size;
86127 +       unsigned long address;
86128 +} __attribute__((packed)) ;
86129 +
86130 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
86131 +
86132 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
86133 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
86134 +
86135 +static inline void clear_LDT(void)
86136 +{
86137 +       int cpu = get_cpu();
86138 +
86139 +       /*
86140 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
86141 +        * it slows down context switching. No one uses it anyway.
86142 +        */
86143 +       cpu = cpu;              /* XXX avoid compiler warning */
86144 +       xen_set_ldt(0UL, 0);
86145 +       put_cpu();
86146 +}
86147 +
86148 +/*
86149 + * This is the ldt that every process will get unless we need
86150 + * something other than this.
86151 + */
86152 +extern struct desc_struct default_ldt[];
86153 +#ifndef CONFIG_X86_NO_IDT
86154 +extern struct gate_struct idt_table[]; 
86155 +#endif
86156 +extern struct desc_ptr cpu_gdt_descr[];
86157 +
86158 +/* the cpu gdt accessor */
86159 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
86160 +
86161 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)  
86162 +{
86163 +       struct gate_struct s;   
86164 +       s.offset_low = PTR_LOW(func); 
86165 +       s.segment = __KERNEL_CS;
86166 +       s.ist = ist; 
86167 +       s.p = 1;
86168 +       s.dpl = dpl; 
86169 +       s.zero0 = 0;
86170 +       s.zero1 = 0; 
86171 +       s.type = type; 
86172 +       s.offset_middle = PTR_MIDDLE(func); 
86173 +       s.offset_high = PTR_HIGH(func); 
86174 +       /* does not need to be atomic because it is only done once at setup time */ 
86175 +       memcpy(adr, &s, 16); 
86176 +} 
86177 +
86178 +#ifndef CONFIG_X86_NO_IDT
86179 +static inline void set_intr_gate(int nr, void *func) 
86180 +{ 
86181 +       BUG_ON((unsigned)nr > 0xFF);
86182 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
86183 +} 
86184 +
86185 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
86186 +{ 
86187 +       BUG_ON((unsigned)nr > 0xFF);
86188 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
86189 +} 
86190 +
86191 +static inline void set_system_gate(int nr, void *func) 
86192 +{ 
86193 +       BUG_ON((unsigned)nr > 0xFF);
86194 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
86195 +} 
86196 +
86197 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
86198 +{
86199 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
86200 +}
86201 +#endif
86202 +
86203 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, 
86204 +                                        unsigned size) 
86205 +{ 
86206 +       struct ldttss_desc d;
86207 +       memset(&d,0,sizeof(d)); 
86208 +       d.limit0 = size & 0xFFFF;
86209 +       d.base0 = PTR_LOW(tss); 
86210 +       d.base1 = PTR_MIDDLE(tss) & 0xFF; 
86211 +       d.type = type;
86212 +       d.p = 1; 
86213 +       d.limit1 = (size >> 16) & 0xF;
86214 +       d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; 
86215 +       d.base3 = PTR_HIGH(tss); 
86216 +       memcpy(ptr, &d, 16); 
86217 +}
86218 +
86219 +#ifndef CONFIG_X86_NO_TSS
86220 +static inline void set_tss_desc(unsigned cpu, void *addr)
86221 +{ 
86222 +       /*
86223 +        * sizeof(unsigned long) coming from an extra "long" at the end
86224 +        * of the iobitmap. See tss_struct definition in processor.h
86225 +        *
86226 +        * The extra -1: the segment base+limit should point to the address of
86227 +        * the last valid byte
86228 +        */
86229 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
86230 +               (unsigned long)addr, DESC_TSS,
86231 +               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
86232 +} 
86233 +#endif
86234 +
86235 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
86236 +{ 
86237 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
86238 +                             DESC_LDT, size * 8 - 1);
86239 +}
86240 +
86241 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
86242 +{ 
86243 +       struct desc_struct *d = &cpu_gdt(cpu)[entry];
86244 +       u32 addr = (u32)(u64)base;
86245 +       BUG_ON((u64)base >> 32); 
86246 +       d->base0 = addr & 0xffff;
86247 +       d->base1 = (addr >> 16) & 0xff;
86248 +       d->base2 = (addr >> 24) & 0xff;
86249 +} 
86250 +
86251 +#define LDT_entry_a(info) \
86252 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
86253 +/* Don't allow setting of the lm bit. It is useless anyway because 
86254 +   64-bit system calls require __USER_CS. */ 
86255 +#define LDT_entry_b(info) \
86256 +       (((info)->base_addr & 0xff000000) | \
86257 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
86258 +       ((info)->limit & 0xf0000) | \
86259 +       (((info)->read_exec_only ^ 1) << 9) | \
86260 +       ((info)->contents << 10) | \
86261 +       (((info)->seg_not_present ^ 1) << 15) | \
86262 +       ((info)->seg_32bit << 22) | \
86263 +       ((info)->limit_in_pages << 23) | \
86264 +       ((info)->useable << 20) | \
86265 +       /* ((info)->lm << 21) | */ \
86266 +       0x7000)
86267 +
86268 +#define LDT_empty(info) (\
86269 +       (info)->base_addr       == 0    && \
86270 +       (info)->limit           == 0    && \
86271 +       (info)->contents        == 0    && \
86272 +       (info)->read_exec_only  == 1    && \
86273 +       (info)->seg_32bit       == 0    && \
86274 +       (info)->limit_in_pages  == 0    && \
86275 +       (info)->seg_not_present == 1    && \
86276 +       (info)->useable         == 0    && \
86277 +       (info)->lm              == 0)
86278 +
86279 +#if TLS_SIZE != 24
86280 +# error update this code.
86281 +#endif
86282 +
86283 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
86284 +{
86285 +#if 0
86286 +       u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
86287 +       gdt[0] = t->tls_array[0];
86288 +       gdt[1] = t->tls_array[1];
86289 +       gdt[2] = t->tls_array[2];
86290 +#endif
86291 +#define C(i) \
86292 +       HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i])
86293 +
86294 +       C(0); C(1); C(2);
86295 +#undef C
86296 +} 
86297 +
86298 +/*
86299 + * load one particular LDT into the current CPU
86300 + */
86301 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
86302 +{
86303 +       void *segments = pc->ldt;
86304 +       int count = pc->size;
86305 +
86306 +       if (likely(!count))
86307 +               segments = NULL;
86308 +
86309 +       xen_set_ldt((unsigned long)segments, count);
86310 +}
86311 +
86312 +static inline void load_LDT(mm_context_t *pc)
86313 +{
86314 +       int cpu = get_cpu();
86315 +       load_LDT_nolock(pc, cpu);
86316 +       put_cpu();
86317 +}
86318 +
86319 +extern struct desc_ptr idt_descr;
86320 +
86321 +#endif /* !__ASSEMBLY__ */
86322 +
86323 +#endif
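
The gate and descriptor setters above all split a 64-bit address into 16/16/32-bit fields before writing the 16-byte descriptor. A minimal, self-contained C sketch of that split, mirroring PTR_LOW/PTR_MIDDLE/PTR_HIGH (the handler address is made up purely for illustration):

#include <stdio.h>
#include <stdint.h>

/* Same split _set_gate() performs before filling a gate_struct. */
int main(void)
{
        uint64_t func = 0xffffffff81234567ULL;  /* hypothetical handler address */
        uint16_t low    = func & 0xFFFF;
        uint16_t middle = (func >> 16) & 0xFFFF;
        uint32_t high   = func >> 32;

        printf("low=%#x middle=%#x high=%#x\n",
               (unsigned)low, (unsigned)middle, (unsigned)high);

        /* Reassembling the three fields gives the original address back. */
        uint64_t back = ((uint64_t)high << 32) | ((uint64_t)middle << 16) | low;
        printf("round trip ok: %d\n", back == func);
        return 0;
}
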
86324 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/dma-mapping.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/dma-mapping.h
86325 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/dma-mapping.h  1970-01-01 00:00:00.000000000 +0000
86326 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/dma-mapping.h        2007-02-02 19:10:58.000000000 +0000
86327 @@ -0,0 +1,200 @@
86328 +#ifndef _X8664_DMA_MAPPING_H
86329 +#define _X8664_DMA_MAPPING_H 1
86330 +
86331 +/*
86332 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
86333 + * documentation.
86334 + */
86335 +
86336 +
86337 +#include <asm/scatterlist.h>
86338 +#include <asm/io.h>
86339 +#include <asm/swiotlb.h>
86340 +
86341 +struct dma_mapping_ops {
86342 +       int             (*mapping_error)(dma_addr_t dma_addr);
86343 +       void*           (*alloc_coherent)(struct device *dev, size_t size,
86344 +                                dma_addr_t *dma_handle, gfp_t gfp);
86345 +       void            (*free_coherent)(struct device *dev, size_t size,
86346 +                                void *vaddr, dma_addr_t dma_handle);
86347 +       dma_addr_t      (*map_single)(struct device *hwdev, void *ptr,
86348 +                                size_t size, int direction);
86349 +       /* like map_single, but doesn't check the device mask */
86350 +       dma_addr_t      (*map_simple)(struct device *hwdev, char *ptr,
86351 +                                size_t size, int direction);
86352 +       void            (*unmap_single)(struct device *dev, dma_addr_t addr,
86353 +                               size_t size, int direction);
86354 +       void            (*sync_single_for_cpu)(struct device *hwdev,
86355 +                               dma_addr_t dma_handle, size_t size,
86356 +                               int direction);
86357 +       void            (*sync_single_for_device)(struct device *hwdev,
86358 +                                dma_addr_t dma_handle, size_t size,
86359 +                               int direction);
86360 +       void            (*sync_single_range_for_cpu)(struct device *hwdev,
86361 +                                dma_addr_t dma_handle, unsigned long offset,
86362 +                               size_t size, int direction);
86363 +       void            (*sync_single_range_for_device)(struct device *hwdev,
86364 +                               dma_addr_t dma_handle, unsigned long offset,
86365 +                               size_t size, int direction);
86366 +       void            (*sync_sg_for_cpu)(struct device *hwdev,
86367 +                                struct scatterlist *sg, int nelems,
86368 +                               int direction);
86369 +       void            (*sync_sg_for_device)(struct device *hwdev,
86370 +                               struct scatterlist *sg, int nelems,
86371 +                               int direction);
86372 +       int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
86373 +                               int nents, int direction);
86374 +       void            (*unmap_sg)(struct device *hwdev,
86375 +                               struct scatterlist *sg, int nents,
86376 +                               int direction);
86377 +       int             (*dma_supported)(struct device *hwdev, u64 mask);
86378 +       int             is_phys;
86379 +};
86380 +
86381 +extern dma_addr_t bad_dma_address;
86382 +extern struct dma_mapping_ops* dma_ops;
86383 +extern int iommu_merge;
86384 +
86385 +#if 0
86386 +static inline int dma_mapping_error(dma_addr_t dma_addr)
86387 +{
86388 +       if (dma_ops->mapping_error)
86389 +               return dma_ops->mapping_error(dma_addr);
86390 +
86391 +       return (dma_addr == bad_dma_address);
86392 +}
86393 +
86394 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
86395 +                               dma_addr_t *dma_handle, gfp_t gfp);
86396 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
86397 +                             dma_addr_t dma_handle);
86398 +
86399 +static inline dma_addr_t
86400 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
86401 +              int direction)
86402 +{
86403 +       BUG_ON(!valid_dma_direction(direction));
86404 +       return dma_ops->map_single(hwdev, ptr, size, direction);
86405 +}
86406 +
86407 +static inline void
86408 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
86409 +                int direction)
86410 +{
86411 +       BUG_ON(!valid_dma_direction(direction));
86412 +       dma_ops->unmap_single(dev, addr, size, direction);
86413 +}
86414 +
86415 +#define dma_map_page(dev,page,offset,size,dir) \
86416 +       dma_map_single((dev), page_address(page)+(offset), (size), (dir))
86417 +
86418 +#define dma_unmap_page dma_unmap_single
86419 +
86420 +static inline void
86421 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
86422 +                       size_t size, int direction)
86423 +{
86424 +       BUG_ON(!valid_dma_direction(direction));
86425 +       if (dma_ops->sync_single_for_cpu)
86426 +               dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
86427 +                                            direction);
86428 +       flush_write_buffers();
86429 +}
86430 +
86431 +static inline void
86432 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
86433 +                          size_t size, int direction)
86434 +{
86435 +       BUG_ON(!valid_dma_direction(direction));
86436 +       if (dma_ops->sync_single_for_device)
86437 +               dma_ops->sync_single_for_device(hwdev, dma_handle, size,
86438 +                                               direction);
86439 +       flush_write_buffers();
86440 +}
86441 +
86442 +static inline void
86443 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
86444 +                             unsigned long offset, size_t size, int direction)
86445 +{
86446 +       BUG_ON(!valid_dma_direction(direction));
86447 +       if (dma_ops->sync_single_range_for_cpu) {
86448 +               dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
86449 +       }
86450 +
86451 +       flush_write_buffers();
86452 +}
86453 +
86454 +static inline void
86455 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
86456 +                                unsigned long offset, size_t size, int direction)
86457 +{
86458 +       BUG_ON(!valid_dma_direction(direction));
86459 +       if (dma_ops->sync_single_range_for_device)
86460 +               dma_ops->sync_single_range_for_device(hwdev, dma_handle,
86461 +                                                     offset, size, direction);
86462 +
86463 +       flush_write_buffers();
86464 +}
86465 +
86466 +static inline void
86467 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
86468 +                   int nelems, int direction)
86469 +{
86470 +       BUG_ON(!valid_dma_direction(direction));
86471 +       if (dma_ops->sync_sg_for_cpu)
86472 +               dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
86473 +       flush_write_buffers();
86474 +}
86475 +
86476 +static inline void
86477 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
86478 +                      int nelems, int direction)
86479 +{
86480 +       BUG_ON(!valid_dma_direction(direction));
86481 +       if (dma_ops->sync_sg_for_device) {
86482 +               dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
86483 +       }
86484 +
86485 +       flush_write_buffers();
86486 +}
86487 +
86488 +static inline int
86489 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
86490 +{
86491 +       BUG_ON(!valid_dma_direction(direction));
86492 +       return dma_ops->map_sg(hwdev, sg, nents, direction);
86493 +}
86494 +
86495 +static inline void
86496 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
86497 +            int direction)
86498 +{
86499 +       BUG_ON(!valid_dma_direction(direction));
86500 +       dma_ops->unmap_sg(hwdev, sg, nents, direction);
86501 +}
86502 +
86503 +extern int dma_supported(struct device *hwdev, u64 mask);
86504 +
86505 +/* same for gart, swiotlb, and nommu */
86506 +static inline int dma_get_cache_alignment(void)
86507 +{
86508 +       return boot_cpu_data.x86_clflush_size;
86509 +}
86510 +
86511 +#define dma_is_consistent(h) 1
86512 +
86513 +extern int dma_set_mask(struct device *dev, u64 mask);
86514 +
86515 +static inline void
86516 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
86517 +{
86518 +       flush_write_buffers();
86519 +}
86520 +
86521 +extern struct device fallback_dev;
86522 +#endif
86523 +extern int panic_on_overflow;
86524 +
86525 +#endif /* _X8664_DMA_MAPPING_H */
86526 +
86527 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
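
struct dma_mapping_ops above is an ops table: each backend (nommu, GART, SWIOTLB) fills in the function pointers, and the inline wrappers dispatch through dma_ops, falling back to generic behaviour when an optional hook is left NULL, as dma_mapping_error() does in the disabled block. A small stand-alone sketch of that dispatch pattern (names and values are illustrative, not the kernel API):

#include <stdio.h>
#include <stddef.h>

typedef unsigned long dma_addr_t;
#define BAD_DMA_ADDRESS ((dma_addr_t)-1)

struct dma_mapping_ops {
        int (*mapping_error)(dma_addr_t addr);  /* optional hook */
};

/* Backend that leaves the hook unset. */
static struct dma_mapping_ops nommu_ops = { .mapping_error = NULL };
static struct dma_mapping_ops *dma_ops = &nommu_ops;

/* Wrapper: use the backend hook if present, else the generic check. */
static int dma_mapping_error(dma_addr_t addr)
{
        if (dma_ops->mapping_error)
                return dma_ops->mapping_error(addr);
        return addr == BAD_DMA_ADDRESS;
}

int main(void)
{
        printf("good handle -> %d\n", dma_mapping_error(0x1000));
        printf("bad handle  -> %d\n", dma_mapping_error(BAD_DMA_ADDRESS));
        return 0;
}
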
86528 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/dmi.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/dmi.h
86529 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/dmi.h  1970-01-01 00:00:00.000000000 +0000
86530 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/dmi.h        2007-02-02 19:10:58.000000000 +0000
86531 @@ -0,0 +1,29 @@
86532 +#ifndef _ASM_DMI_H
86533 +#define _ASM_DMI_H 1
86534 +
86535 +#include <asm/io.h>
86536 +
86537 +extern void *dmi_ioremap(unsigned long addr, unsigned long size);
86538 +extern void dmi_iounmap(void *addr, unsigned long size);
86539 +extern void *bt_ioremap(unsigned long addr, unsigned long size);
86540 +extern void bt_iounmap(void *addr, unsigned long size);
86541 +
86542 +#define DMI_MAX_DATA 2048
86543 +
86544 +extern int dmi_alloc_index;
86545 +extern char dmi_alloc_data[DMI_MAX_DATA];
86546 +
86547 +/* This is so early that there is no good way to allocate dynamic memory. 
86548 +   Allocate data in a BSS array. */
86549 +static inline void *dmi_alloc(unsigned len)
86550 +{
86551 +       int idx = dmi_alloc_index;
86552 +       if ((dmi_alloc_index += len) > DMI_MAX_DATA)
86553 +               return NULL;
86554 +       return dmi_alloc_data + idx;
86555 +}
86556 +
86557 +#define dmi_ioremap bt_ioremap
86558 +#define dmi_iounmap bt_iounmap
86559 +
86560 +#endif
86561 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/e820.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/e820.h
86562 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/e820.h 1970-01-01 00:00:00.000000000 +0000
86563 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/e820.h       2007-02-02 19:10:58.000000000 +0000
86564 @@ -0,0 +1,59 @@
86565 +/*
86566 + * structures and definitions for the int 15, ax=e820 memory map
86567 + * scheme.
86568 + *
86569 + * In a nutshell, setup.S populates a scratch table in the
86570 + * empty_zero_block that contains a list of usable address/size
86571 + * pairs. In setup.c, this information is transferred into the e820map,
86572 + * and in init.c/numa.c, that new information is used to mark pages
86573 + * reserved or not.
86574 + */
86575 +#ifndef __E820_HEADER
86576 +#define __E820_HEADER
86577 +
86578 +#include <linux/mmzone.h>
86579 +
86580 +#define E820MAP        0x2d0           /* our map */
86581 +#define E820MAX        128             /* number of entries in E820MAP */
86582 +#define E820NR 0x1e8           /* # entries in E820MAP */
86583 +
86584 +#define E820_RAM       1
86585 +#define E820_RESERVED  2
86586 +#define E820_ACPI      3
86587 +#define E820_NVS       4
86588 +
86589 +#ifndef __ASSEMBLY__
86590 +struct e820entry {
86591 +       u64 addr;       /* start of memory segment */
86592 +       u64 size;       /* size of memory segment */
86593 +       u32 type;       /* type of memory segment */
86594 +} __attribute__((packed));
86595 +
86596 +struct e820map {
86597 +    int nr_map;
86598 +       struct e820entry map[E820MAX];
86599 +};
86600 +
86601 +extern unsigned long find_e820_area(unsigned long start, unsigned long end, 
86602 +                                   unsigned size);
86603 +extern void add_memory_region(unsigned long start, unsigned long size, 
86604 +                             int type);
86605 +extern void setup_memory_region(void);
86606 +extern void contig_e820_setup(void); 
86607 +extern unsigned long e820_end_of_ram(void);
86608 +extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
86609 +extern void e820_mark_nosave_regions(void);
86610 +extern void e820_print_map(char *who);
86611 +extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
86612 +extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
86613 +
86614 +extern void e820_setup_gap(struct e820entry *e820, int nr_map);
86615 +extern void e820_register_active_regions(int nid,
86616 +                               unsigned long start_pfn, unsigned long end_pfn);
86617 +
86618 +extern void finish_e820_parsing(void);
86619 +
86620 +extern struct e820map e820;
86621 +#endif/*!__ASSEMBLY__*/
86622 +
86623 +#endif/*__E820_HEADER*/
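
The e820 map is just an array of (addr, size, type) entries plus a count; consumers walk it and act on the type. A stand-alone sketch that sums the E820_RAM entries of a hypothetical two-entry map (the addresses and sizes are invented for illustration):

#include <stdio.h>
#include <stdint.h>

#define E820MAX   128
#define E820_RAM  1

struct e820entry {
        uint64_t addr;  /* start of memory segment */
        uint64_t size;  /* size of memory segment */
        uint32_t type;  /* type of memory segment */
} __attribute__((packed));

struct e820map {
        int nr_map;
        struct e820entry map[E820MAX];
};

int main(void)
{
        struct e820map e820 = {
                .nr_map = 2,
                .map = {
                        { .addr = 0x0,      .size = 0x9f000,    .type = E820_RAM },
                        { .addr = 0x100000, .size = 0x3ff00000, .type = E820_RAM },
                },
        };
        uint64_t ram = 0;

        /* Walk the map and count only usable RAM. */
        for (int i = 0; i < e820.nr_map; i++)
                if (e820.map[i].type == E820_RAM)
                        ram += e820.map[i].size;

        printf("usable RAM: %llu bytes\n", (unsigned long long)ram);
        return 0;
}
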
86624 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/fixmap.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/fixmap.h
86625 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/fixmap.h       1970-01-01 00:00:00.000000000 +0000
86626 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/fixmap.h     2007-02-02 19:10:58.000000000 +0000
86627 @@ -0,0 +1,108 @@
86628 +/*
86629 + * fixmap.h: compile-time virtual memory allocation
86630 + *
86631 + * This file is subject to the terms and conditions of the GNU General Public
86632 + * License.  See the file "COPYING" in the main directory of this archive
86633 + * for more details.
86634 + *
86635 + * Copyright (C) 1998 Ingo Molnar
86636 + */
86637 +
86638 +#ifndef _ASM_FIXMAP_H
86639 +#define _ASM_FIXMAP_H
86640 +
86641 +#include <linux/kernel.h>
86642 +#include <asm/apicdef.h>
86643 +#include <asm/page.h>
86644 +#include <asm/vsyscall.h>
86645 +#include <asm/vsyscall32.h>
86646 +#include <asm/acpi.h>
86647 +
86648 +/*
86649 + * Here we define all the compile-time 'special' virtual
86650 + * addresses. The point is to have a constant address at
86651 + * compile time, but to set the physical address only
86652 + * in the boot process.
86653 + *
86654 + * these 'compile-time allocated' memory buffers are
86655 + * fixed-size 4k pages (or larger if used with an increment
86656 + * higher than 1). Use set_fixmap(idx,phys) to associate
86657 + * physical memory with fixmap indices.
86658 + *
86659 + * TLB entries of such buffers will not be flushed across
86660 + * task switches.
86661 + */
86662 +
86663 +enum fixed_addresses {
86664 +       VSYSCALL_LAST_PAGE,
86665 +       VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
86666 +       VSYSCALL_HPET,
86667 +       FIX_HPET_BASE,
86668 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
86669 +       FIX_IO_APIC_BASE_0,
86670 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
86671 +#ifdef CONFIG_ACPI
86672 +       FIX_ACPI_BEGIN,
86673 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
86674 +#endif
86675 +       FIX_SHARED_INFO,
86676 +#define NR_FIX_ISAMAPS 256
86677 +       FIX_ISAMAP_END,
86678 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
86679 +       __end_of_permanent_fixed_addresses,
86680 +       /* temporary boot-time mappings, used before ioremap() is functional */
86681 +#define NR_FIX_BTMAPS  16
86682 +       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
86683 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
86684 +       __end_of_fixed_addresses
86685 +};
86686 +
86687 +extern void __set_fixmap (enum fixed_addresses idx,
86688 +                                       unsigned long phys, pgprot_t flags);
86689 +
86690 +#define set_fixmap(idx, phys) \
86691 +               __set_fixmap(idx, phys, PAGE_KERNEL)
86692 +/*
86693 + * Some hardware wants to get fixmapped without caching.
86694 + */
86695 +#define set_fixmap_nocache(idx, phys) \
86696 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
86697 +
86698 +#define clear_fixmap(idx) \
86699 +                __set_fixmap(idx, 0, __pgprot(0))
86700 +
86701 +#define FIXADDR_TOP    (VSYSCALL_END-PAGE_SIZE)
86702 +#define FIXADDR_SIZE   (__end_of_fixed_addresses << PAGE_SHIFT)
86703 +#define FIXADDR_START  (FIXADDR_TOP - FIXADDR_SIZE)
86704 +
86705 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
86706 +#define FIXADDR_USER_START     ((unsigned long)VSYSCALL32_VSYSCALL)
86707 +#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
86708 +
86709 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
86710 +
86711 +extern void __this_fixmap_does_not_exist(void);
86712 +
86713 +/*
86714 + * 'index to address' translation. If anyone tries to use the idx
86715 + * directly without translation, we catch the bug with a NULL-dereference
86716 + * kernel oops. Illegal ranges of incoming indices are caught too.
86717 + */
86718 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
86719 +{
86720 +       /*
86721 +        * this branch gets completely eliminated after inlining,
86722 +        * except when someone tries to use fixaddr indices in an
86723 +        * illegal way. (such as mixing up address types or using
86724 +        * out-of-range indices).
86725 +        *
86726 +        * If it doesn't get removed, the linker will complain
86727 + * loudly with a reasonably clear error message.
86728 +        */
86729 +       if (idx >= __end_of_fixed_addresses)
86730 +               __this_fixmap_does_not_exist();
86731 +
86732 +        return __fix_to_virt(idx);
86733 +}
86734 +
86735 +#endif
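
fix_to_virt() is pure arithmetic: fixmap slot idx lives idx pages below FIXADDR_TOP, so every fixmap address is a compile-time constant. A user-space sketch of the same formula (the FIXADDR_TOP value here is invented for illustration only):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT   12
#define FIXADDR_TOP  0xffffffffff5ff000ULL  /* hypothetical top address */

/* Same formula as __fix_to_virt(): slot idx sits idx pages below the top. */
static uint64_t fix_to_virt(unsigned int idx)
{
        return FIXADDR_TOP - ((uint64_t)idx << PAGE_SHIFT);
}

int main(void)
{
        for (unsigned int idx = 0; idx < 4; idx++)
                printf("fixmap slot %u -> %#llx\n", idx,
                       (unsigned long long)fix_to_virt(idx));
        return 0;
}
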
86736 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/floppy.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/floppy.h
86737 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/floppy.h       1970-01-01 00:00:00.000000000 +0000
86738 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/floppy.h     2007-02-02 19:10:58.000000000 +0000
86739 @@ -0,0 +1,207 @@
86740 +/*
86741 + * Architecture specific parts of the Floppy driver
86742 + *
86743 + * This file is subject to the terms and conditions of the GNU General Public
86744 + * License.  See the file "COPYING" in the main directory of this archive
86745 + * for more details.
86746 + *
86747 + * Copyright (C) 1995
86748 + *
86749 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
86750 + */
86751 +#ifndef __ASM_XEN_X86_64_FLOPPY_H
86752 +#define __ASM_XEN_X86_64_FLOPPY_H
86753 +
86754 +#include <linux/vmalloc.h>
86755 +
86756 +
86757 +/*
86758 + * The DMA channel used by the floppy controller cannot access data at
86759 + * addresses >= 16MB
86760 + *
86761 + * Went back to the 1MB limit, as some people had problems with the floppy
86762 + * driver otherwise. It doesn't matter much for performance anyway, as most
86763 + * floppy accesses go through the track buffer.
86764 + */
86765 +#define _CROSS_64KB(a,s,vdma) \
86766 +(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
86767 +
86768 +/* XEN: Hit DMA paths on the head. This trick is from asm-m68k/floppy.h. */
86769 +#include <asm/dma.h>
86770 +#undef MAX_DMA_ADDRESS
86771 +#define MAX_DMA_ADDRESS 0
86772 +#define CROSS_64KB(a,s) (0)
86773 +
86774 +#define fd_inb(port)                   inb_p(port)
86775 +#define fd_outb(value,port)            outb_p(value,port)
86776 +
86777 +#define fd_request_dma()        (0)
86778 +#define fd_free_dma()           ((void)0)
86779 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
86780 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
86781 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
86782 +#define fd_get_dma_residue()    vdma_get_dma_residue(FLOPPY_DMA)
86783 +/*
86784 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
86785 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
86786 + */
86787 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size))
86788 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
86789 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
86790 +
86791 +static int virtual_dma_count;
86792 +static int virtual_dma_residue;
86793 +static char *virtual_dma_addr;
86794 +static int virtual_dma_mode;
86795 +static int doing_pdma;
86796 +
86797 +static irqreturn_t floppy_hardint(int irq, void *dev_id)
86798 +{
86799 +       register unsigned char st;
86800 +
86801 +#undef TRACE_FLPY_INT
86802 +
86803 +#ifdef TRACE_FLPY_INT
86804 +       static int calls=0;
86805 +       static int bytes=0;
86806 +       static int dma_wait=0;
86807 +#endif
86808 +       if (!doing_pdma)
86809 +               return floppy_interrupt(irq, dev_id);
86810 +
86811 +#ifdef TRACE_FLPY_INT
86812 +       if(!calls)
86813 +               bytes = virtual_dma_count;
86814 +#endif
86815 +
86816 +       {
86817 +               register int lcount;
86818 +               register char *lptr;
86819 +
86820 +               st = 1;
86821 +               for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
86822 +                   lcount; lcount--, lptr++) {
86823 +                       st=inb(virtual_dma_port+4) & 0xa0 ;
86824 +                       if(st != 0xa0) 
86825 +                               break;
86826 +                       if(virtual_dma_mode)
86827 +                               outb_p(*lptr, virtual_dma_port+5);
86828 +                       else
86829 +                               *lptr = inb_p(virtual_dma_port+5);
86830 +               }
86831 +               virtual_dma_count = lcount;
86832 +               virtual_dma_addr = lptr;
86833 +               st = inb(virtual_dma_port+4);
86834 +       }
86835 +
86836 +#ifdef TRACE_FLPY_INT
86837 +       calls++;
86838 +#endif
86839 +       if(st == 0x20)
86840 +               return IRQ_HANDLED;
86841 +       if(!(st & 0x20)) {
86842 +               virtual_dma_residue += virtual_dma_count;
86843 +               virtual_dma_count=0;
86844 +#ifdef TRACE_FLPY_INT
86845 +               printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 
86846 +                      virtual_dma_count, virtual_dma_residue, calls, bytes,
86847 +                      dma_wait);
86848 +               calls = 0;
86849 +               dma_wait=0;
86850 +#endif
86851 +               doing_pdma = 0;
86852 +               floppy_interrupt(irq, dev_id);
86853 +               return IRQ_HANDLED;
86854 +       }
86855 +#ifdef TRACE_FLPY_INT
86856 +       if(!virtual_dma_count)
86857 +               dma_wait++;
86858 +#endif
86859 +       return IRQ_HANDLED;
86860 +}
86861 +
86862 +static void fd_disable_dma(void)
86863 +{
86864 +       doing_pdma = 0;
86865 +       virtual_dma_residue += virtual_dma_count;
86866 +       virtual_dma_count=0;
86867 +}
86868 +
86869 +static int vdma_get_dma_residue(unsigned int dummy)
86870 +{
86871 +       return virtual_dma_count + virtual_dma_residue;
86872 +}
86873 +
86874 +
86875 +static int fd_request_irq(void)
86876 +{
86877 +       return request_irq(FLOPPY_IRQ, floppy_hardint,
86878 +                          IRQF_DISABLED, "floppy", NULL);
86879 +}
86880 +
86881 +#if 0
86882 +static unsigned long vdma_mem_alloc(unsigned long size)
86883 +{
86884 +       return (unsigned long) vmalloc(size);
86885 +
86886 +}
86887 +
86888 +static void vdma_mem_free(unsigned long addr, unsigned long size)
86889 +{
86890 +       vfree((void *)addr);
86891 +}
86892 +#endif
86893 +
86894 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
86895 +{
86896 +       doing_pdma = 1;
86897 +       virtual_dma_port = io;
86898 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
86899 +       virtual_dma_addr = addr;
86900 +       virtual_dma_count = size;
86901 +       virtual_dma_residue = 0;
86902 +       return 0;
86903 +}
86904 +
86905 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
86906 +#define FDC1 xen_floppy_init()
86907 +static int FDC2 = -1;
86908 +
86909 +static int xen_floppy_init(void)
86910 +{
86911 +       use_virtual_dma = 1;
86912 +       can_use_virtual_dma = 1;
86913 +       return 0x3f0;
86914 +}
86915 +
86916 +/*
86917 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
86918 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
86919 + * coincides with another rtc CMOS user.               Paul G.
86920 + */
86921 +#define FLOPPY0_TYPE   ({                              \
86922 +       unsigned long flags;                            \
86923 +       unsigned char val;                              \
86924 +       spin_lock_irqsave(&rtc_lock, flags);            \
86925 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
86926 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
86927 +       val;                                            \
86928 +})
86929 +
86930 +#define FLOPPY1_TYPE   ({                              \
86931 +       unsigned long flags;                            \
86932 +       unsigned char val;                              \
86933 +       spin_lock_irqsave(&rtc_lock, flags);            \
86934 +       val = CMOS_READ(0x10) & 15;                     \
86935 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
86936 +       val;                                            \
86937 +})
86938 +
86939 +#define N_FDC 2
86940 +#define N_DRIVE 8
86941 +
86942 +#define FLOPPY_MOTOR_MASK 0xf0
86943 +
86944 +#define EXTRA_FLOPPY_PARAMS
86945 +
86946 +#endif /* __ASM_XEN_X86_64_FLOPPY_H */
86947 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/hw_irq.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/hw_irq.h
86948 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/hw_irq.h       1970-01-01 00:00:00.000000000 +0000
86949 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/hw_irq.h     2007-02-02 19:10:58.000000000 +0000
86950 @@ -0,0 +1,136 @@
86951 +#ifndef _ASM_HW_IRQ_H
86952 +#define _ASM_HW_IRQ_H
86953 +
86954 +/*
86955 + *     linux/include/asm/hw_irq.h
86956 + *
86957 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
86958 + *
86959 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
86960 + *
86961 + *     IRQ/IPI changes taken from work by Thomas Radke
86962 + *     <tomsoft@informatik.tu-chemnitz.de>
86963 + *
86964 + *     hacked by Andi Kleen for x86-64.
86965 + */
86966 +
86967 +#ifndef __ASSEMBLY__
86968 +#include <asm/atomic.h>
86969 +#include <asm/irq.h>
86970 +#include <linux/profile.h>
86971 +#include <linux/smp.h>
86972 +#include <linux/percpu.h>
86973 +#endif
86974 +
86975 +#define NMI_VECTOR             0x02
86976 +/*
86977 + * IDT vectors usable for external interrupt sources start
86978 + * at 0x20:
86979 + */
86980 +#define FIRST_EXTERNAL_VECTOR  0x20
86981 +
86982 +#define IA32_SYSCALL_VECTOR    0x80
86983 +
86984 +
86985 +/*
86986 + * Vectors 0x20-0x2f are used for ISA interrupts.
86987 + */
86988 +
86989 +/*
86990 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
86991 + *
86992 + *  some of the following vectors are 'rare'; they are merged
86993 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
86994 + *  TLB, reschedule and local APIC vectors are performance-critical.
86995 + */
86996 +#ifndef CONFIG_XEN
86997 +#define SPURIOUS_APIC_VECTOR   0xff
86998 +#define ERROR_APIC_VECTOR      0xfe
86999 +#define RESCHEDULE_VECTOR      0xfd
87000 +#define CALL_FUNCTION_VECTOR   0xfc
87001 +/* fb free - please don't re-add KDB here because it's useless
87002 +   (hint - think what an NMI bit does to a vector) */
87003 +#define THERMAL_APIC_VECTOR    0xfa
87004 +#define THRESHOLD_APIC_VECTOR   0xf9
87005 +/* f8 free */
87006 +#define INVALIDATE_TLB_VECTOR_END      0xf7
87007 +#define INVALIDATE_TLB_VECTOR_START    0xf0    /* f0-f7 used for TLB flush */
87008 +
87009 +#define NUM_INVALIDATE_TLB_VECTORS     8
87010 +#endif
87011 +
87012 +/*
87013 + * Local APIC timer IRQ vector is on a different priority level,
87014 + * to work around the 'lost local interrupt if more than 2 IRQ
87015 + * sources per level' errata.
87016 + */
87017 +#define LOCAL_TIMER_VECTOR     0xef
87018 +
87019 +/*
87020 + * First APIC vector available to drivers: (vectors 0x30-0xee)
87021 + * we start at 0x31 to spread out vectors evenly between priority
87022 + * levels. (0x80 is the syscall vector)
87023 + */
87024 +#define FIRST_DEVICE_VECTOR    0x31
87025 +#define FIRST_SYSTEM_VECTOR    0xef   /* duplicated in irq.h */
87026 +
87027 +
87028 +#ifndef __ASSEMBLY__
87029 +typedef int vector_irq_t[NR_VECTORS];
87030 +DECLARE_PER_CPU(vector_irq_t, vector_irq);
87031 +extern void __setup_vector_irq(int cpu);
87032 +extern spinlock_t vector_lock;
87033 +
87034 +/*
87035 + * Various low-level irq details needed by irq.c, process.c,
87036 + * time.c, io_apic.c and smp.c
87037 + *
87038 + * Interrupt entry/exit code at both C and assembly level
87039 + */
87040 +
87041 +extern void disable_8259A_irq(unsigned int irq);
87042 +extern void enable_8259A_irq(unsigned int irq);
87043 +extern int i8259A_irq_pending(unsigned int irq);
87044 +extern void make_8259A_irq(unsigned int irq);
87045 +extern void init_8259A(int aeoi);
87046 +extern void FASTCALL(send_IPI_self(int vector));
87047 +extern void init_VISWS_APIC_irqs(void);
87048 +extern void setup_IO_APIC(void);
87049 +extern void disable_IO_APIC(void);
87050 +extern void print_IO_APIC(void);
87051 +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
87052 +extern void send_IPI(int dest, int vector);
87053 +extern void setup_ioapic_dest(void);
87054 +
87055 +extern unsigned long io_apic_irqs;
87056 +
87057 +extern atomic_t irq_err_count;
87058 +extern atomic_t irq_mis_count;
87059 +
87060 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
87061 +
87062 +#define __STR(x) #x
87063 +#define STR(x) __STR(x)
87064 +
87065 +#include <asm/ptrace.h>
87066 +
87067 +#define IRQ_NAME2(nr) nr##_interrupt(void)
87068 +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
87069 +
87070 +/*
87071 + *     SMP has a few special interrupts for IPI messages
87072 + */
87073 +
87074 +#define BUILD_IRQ(nr) \
87075 +asmlinkage void IRQ_NAME(nr); \
87076 +__asm__( \
87077 +"\n.p2align\n" \
87078 +"IRQ" #nr "_interrupt:\n\t" \
87079 +       "push $" #nr "-256 ; " \
87080 +       "jmp common_interrupt");
87081 +
87082 +#define platform_legacy_irq(irq)       ((irq) < 16)
87083 +
87084 +#endif
87085 +
87086 +#endif /* _ASM_HW_IRQ_H */
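
BUILD_IRQ() and IRQ_NAME() lean on preprocessor token pasting (##) and two-level stringification (STR/__STR) to emit one entry stub per vector. A simplified, compilable sketch of those macro tricks (the (void) is folded into the function definition here instead of into IRQ_NAME2 as in the header, and the printf only describes what the real assembly stub would do):

#include <stdio.h>

#define __STR(x) #x
#define STR(x) __STR(x)

#define IRQ_NAME2(nr) nr##_interrupt
#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)

/* Expands to a function literally named IRQ7_interrupt. */
static void IRQ_NAME(7)(void)
{
        printf("stub %s would push $%s-256 and jump to common_interrupt\n",
               STR(IRQ_NAME(7)), STR(7));
}

int main(void)
{
        IRQ7_interrupt();
        return 0;
}
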
87087 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/hypercall.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/hypercall.h
87088 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/hypercall.h    1970-01-01 00:00:00.000000000 +0000
87089 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/hypercall.h  2007-02-02 19:10:58.000000000 +0000
87090 @@ -0,0 +1,406 @@
87091 +/******************************************************************************
87092 + * hypercall.h
87093 + * 
87094 + * Linux-specific hypervisor handling.
87095 + * 
87096 + * Copyright (c) 2002-2004, K A Fraser
87097 + * 
87098 + * 64-bit updates:
87099 + *   Benjamin Liu <benjamin.liu@intel.com>
87100 + *   Jun Nakajima <jun.nakajima@intel.com>
87101 + * 
87102 + * This program is free software; you can redistribute it and/or
87103 + * modify it under the terms of the GNU General Public License version 2
87104 + * as published by the Free Software Foundation; or, when distributed
87105 + * separately from the Linux kernel or incorporated into other
87106 + * software packages, subject to the following license:
87107 + * 
87108 + * Permission is hereby granted, free of charge, to any person obtaining a copy
87109 + * of this source file (the "Software"), to deal in the Software without
87110 + * restriction, including without limitation the rights to use, copy, modify,
87111 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
87112 + * and to permit persons to whom the Software is furnished to do so, subject to
87113 + * the following conditions:
87114 + * 
87115 + * The above copyright notice and this permission notice shall be included in
87116 + * all copies or substantial portions of the Software.
87117 + * 
87118 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
87119 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87120 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
87121 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
87122 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
87123 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
87124 + * IN THE SOFTWARE.
87125 + */
87126 +
87127 +#ifndef __HYPERCALL_H__
87128 +#define __HYPERCALL_H__
87129 +
87130 +#include <linux/string.h> /* memcpy() */
87131 +
87132 +#ifndef __HYPERVISOR_H__
87133 +# error "please don't include this file directly"
87134 +#endif
87135 +
87136 +#define __STR(x) #x
87137 +#define STR(x) __STR(x)
87138 +
87139 +#ifdef CONFIG_XEN
87140 +#define HYPERCALL_STR(name)                                    \
87141 +       "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
87142 +#else
87143 +#define HYPERCALL_STR(name)                                    \
87144 +       "mov hypercall_stubs,%%rax; "                           \
87145 +       "add $("STR(__HYPERVISOR_##name)" * 32),%%rax; "        \
87146 +       "call *%%rax"
87147 +#endif
87148 +
87149 +#define _hypercall0(type, name)                        \
87150 +({                                             \
87151 +       long __res;                             \
87152 +       asm volatile (                          \
87153 +               HYPERCALL_STR(name)             \
87154 +               : "=a" (__res)                  \
87155 +               :                               \
87156 +               : "memory" );                   \
87157 +       (type)__res;                            \
87158 +})
87159 +
87160 +#define _hypercall1(type, name, a1)                            \
87161 +({                                                             \
87162 +       long __res, __ign1;                                     \
87163 +       asm volatile (                                          \
87164 +               HYPERCALL_STR(name)                             \
87165 +               : "=a" (__res), "=D" (__ign1)                   \
87166 +               : "1" ((long)(a1))                              \
87167 +               : "memory" );                                   \
87168 +       (type)__res;                                            \
87169 +})
87170 +
87171 +#define _hypercall2(type, name, a1, a2)                                \
87172 +({                                                             \
87173 +       long __res, __ign1, __ign2;                             \
87174 +       asm volatile (                                          \
87175 +               HYPERCALL_STR(name)                             \
87176 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2)    \
87177 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
87178 +               : "memory" );                                   \
87179 +       (type)__res;                                            \
87180 +})
87181 +
87182 +#define _hypercall3(type, name, a1, a2, a3)                    \
87183 +({                                                             \
87184 +       long __res, __ign1, __ign2, __ign3;                     \
87185 +       asm volatile (                                          \
87186 +               HYPERCALL_STR(name)                             \
87187 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
87188 +               "=d" (__ign3)                                   \
87189 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
87190 +               "3" ((long)(a3))                                \
87191 +               : "memory" );                                   \
87192 +       (type)__res;                                            \
87193 +})
87194 +
87195 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
87196 +({                                                             \
87197 +       long __res, __ign1, __ign2, __ign3;                     \
87198 +       asm volatile (                                          \
87199 +               "movq %7,%%r10; "                               \
87200 +               HYPERCALL_STR(name)                             \
87201 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
87202 +               "=d" (__ign3)                                   \
87203 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
87204 +               "3" ((long)(a3)), "g" ((long)(a4))              \
87205 +               : "memory", "r10" );                            \
87206 +       (type)__res;                                            \
87207 +})
87208 +
87209 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
87210 +({                                                             \
87211 +       long __res, __ign1, __ign2, __ign3;                     \
87212 +       asm volatile (                                          \
87213 +               "movq %7,%%r10; movq %8,%%r8; "                 \
87214 +               HYPERCALL_STR(name)                             \
87215 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
87216 +               "=d" (__ign3)                                   \
87217 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
87218 +               "3" ((long)(a3)), "g" ((long)(a4)),             \
87219 +               "g" ((long)(a5))                                \
87220 +               : "memory", "r10", "r8" );                      \
87221 +       (type)__res;                                            \
87222 +})
87223 +
87224 +static inline int
87225 +HYPERVISOR_set_trap_table(
87226 +       trap_info_t *table)
87227 +{
87228 +       return _hypercall1(int, set_trap_table, table);
87229 +}
87230 +
87231 +static inline int
87232 +HYPERVISOR_mmu_update(
87233 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
87234 +{
87235 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
87236 +}
87237 +
87238 +static inline int
87239 +HYPERVISOR_mmuext_op(
87240 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
87241 +{
87242 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
87243 +}
87244 +
87245 +static inline int
87246 +HYPERVISOR_set_gdt(
87247 +       unsigned long *frame_list, int entries)
87248 +{
87249 +       return _hypercall2(int, set_gdt, frame_list, entries);
87250 +}
87251 +
87252 +static inline int
87253 +HYPERVISOR_stack_switch(
87254 +       unsigned long ss, unsigned long esp)
87255 +{
87256 +       return _hypercall2(int, stack_switch, ss, esp);
87257 +}
87258 +
87259 +static inline int
87260 +HYPERVISOR_set_callbacks(
87261 +       unsigned long event_address, unsigned long failsafe_address, 
87262 +       unsigned long syscall_address)
87263 +{
87264 +       return _hypercall3(int, set_callbacks,
87265 +                          event_address, failsafe_address, syscall_address);
87266 +}
87267 +
87268 +static inline int
87269 +HYPERVISOR_fpu_taskswitch(
87270 +       int set)
87271 +{
87272 +       return _hypercall1(int, fpu_taskswitch, set);
87273 +}
87274 +
87275 +static inline int
87276 +HYPERVISOR_sched_op_compat(
87277 +       int cmd, unsigned long arg)
87278 +{
87279 +       return _hypercall2(int, sched_op_compat, cmd, arg);
87280 +}
87281 +
87282 +static inline int
87283 +HYPERVISOR_sched_op(
87284 +       int cmd, void *arg)
87285 +{
87286 +       return _hypercall2(int, sched_op, cmd, arg);
87287 +}
87288 +
87289 +static inline long
87290 +HYPERVISOR_set_timer_op(
87291 +       u64 timeout)
87292 +{
87293 +       return _hypercall1(long, set_timer_op, timeout);
87294 +}
87295 +
87296 +static inline int
87297 +HYPERVISOR_dom0_op(
87298 +       dom0_op_t *dom0_op)
87299 +{
87300 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
87301 +       return _hypercall1(int, dom0_op, dom0_op);
87302 +}
87303 +
87304 +static inline int
87305 +HYPERVISOR_set_debugreg(
87306 +       int reg, unsigned long value)
87307 +{
87308 +       return _hypercall2(int, set_debugreg, reg, value);
87309 +}
87310 +
87311 +static inline unsigned long
87312 +HYPERVISOR_get_debugreg(
87313 +       int reg)
87314 +{
87315 +       return _hypercall1(unsigned long, get_debugreg, reg);
87316 +}
87317 +
87318 +static inline int
87319 +HYPERVISOR_update_descriptor(
87320 +       unsigned long ma, unsigned long word)
87321 +{
87322 +       return _hypercall2(int, update_descriptor, ma, word);
87323 +}
87324 +
87325 +static inline int
87326 +HYPERVISOR_memory_op(
87327 +       unsigned int cmd, void *arg)
87328 +{
87329 +       return _hypercall2(int, memory_op, cmd, arg);
87330 +}
87331 +
87332 +static inline int
87333 +HYPERVISOR_multicall(
87334 +       void *call_list, int nr_calls)
87335 +{
87336 +       return _hypercall2(int, multicall, call_list, nr_calls);
87337 +}
87338 +
87339 +static inline int
87340 +HYPERVISOR_update_va_mapping(
87341 +       unsigned long va, pte_t new_val, unsigned long flags)
87342 +{
87343 +       return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
87344 +}
87345 +
87346 +static inline int
87347 +HYPERVISOR_event_channel_op(
87348 +       int cmd, void *arg)
87349 +{
87350 +       int rc = _hypercall2(int, event_channel_op, cmd, arg);
87351 +
87352 +#ifdef CONFIG_XEN_COMPAT_030002
87353 +       if (unlikely(rc == -ENOSYS)) {
87354 +               struct evtchn_op op;
87355 +               op.cmd = cmd;
87356 +               memcpy(&op.u, arg, sizeof(op.u));
87357 +               rc = _hypercall1(int, event_channel_op_compat, &op);
87358 +               memcpy(arg, &op.u, sizeof(op.u));
87359 +       }
87360 +#endif
87361 +
87362 +       return rc;
87363 +}
87364 +
87365 +static inline int
87366 +HYPERVISOR_acm_op(
87367 +       int cmd, void *arg)
87368 +{
87369 +       return _hypercall2(int, acm_op, cmd, arg);
87370 +}
87371 +
87372 +static inline int
87373 +HYPERVISOR_xen_version(
87374 +       int cmd, void *arg)
87375 +{
87376 +       return _hypercall2(int, xen_version, cmd, arg);
87377 +}
87378 +
87379 +static inline int
87380 +HYPERVISOR_console_io(
87381 +       int cmd, int count, char *str)
87382 +{
87383 +       return _hypercall3(int, console_io, cmd, count, str);
87384 +}
87385 +
87386 +static inline int
87387 +HYPERVISOR_physdev_op(
87388 +       int cmd, void *arg)
87389 +{
87390 +       int rc = _hypercall2(int, physdev_op, cmd, arg);
87391 +
87392 +#ifdef CONFIG_XEN_COMPAT_030002
87393 +       if (unlikely(rc == -ENOSYS)) {
87394 +               struct physdev_op op;
87395 +               op.cmd = cmd;
87396 +               memcpy(&op.u, arg, sizeof(op.u));
87397 +               rc = _hypercall1(int, physdev_op_compat, &op);
87398 +               memcpy(arg, &op.u, sizeof(op.u));
87399 +       }
87400 +#endif
87401 +
87402 +       return rc;
87403 +}
87404 +
87405 +static inline int
87406 +HYPERVISOR_grant_table_op(
87407 +       unsigned int cmd, void *uop, unsigned int count)
87408 +{
87409 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
87410 +}
87411 +
87412 +static inline int
87413 +HYPERVISOR_update_va_mapping_otherdomain(
87414 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
87415 +{
87416 +       return _hypercall4(int, update_va_mapping_otherdomain, va,
87417 +                          new_val.pte, flags, domid);
87418 +}
87419 +
87420 +static inline int
87421 +HYPERVISOR_vm_assist(
87422 +       unsigned int cmd, unsigned int type)
87423 +{
87424 +       return _hypercall2(int, vm_assist, cmd, type);
87425 +}
87426 +
87427 +static inline int
87428 +HYPERVISOR_vcpu_op(
87429 +       int cmd, int vcpuid, void *extra_args)
87430 +{
87431 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
87432 +}
87433 +
87434 +static inline int
87435 +HYPERVISOR_set_segment_base(
87436 +       int reg, unsigned long value)
87437 +{
87438 +       return _hypercall2(int, set_segment_base, reg, value);
87439 +}
87440 +
87441 +static inline int
87442 +HYPERVISOR_suspend(
87443 +       unsigned long srec)
87444 +{
87445 +       struct sched_shutdown sched_shutdown = {
87446 +               .reason = SHUTDOWN_suspend
87447 +       };
87448 +
87449 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
87450 +                            &sched_shutdown, srec);
87451 +
87452 +#ifdef CONFIG_XEN_COMPAT_030002
87453 +       if (rc == -ENOSYS)
87454 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
87455 +                                SHUTDOWN_suspend, srec);
87456 +#endif
87457 +
87458 +       return rc;
87459 +}
87460 +
87461 +static inline int
87462 +HYPERVISOR_nmi_op(
87463 +       unsigned long op, void *arg)
87464 +{
87465 +       return _hypercall2(int, nmi_op, op, arg);
87466 +}
87467 +
87468 +static inline unsigned long
87469 +HYPERVISOR_hvm_op(
87470 +    int op, void *arg)
87471 +{
87472 +    return _hypercall2(unsigned long, hvm_op, op, arg);
87473 +}
87474 +
87475 +static inline int
87476 +HYPERVISOR_callback_op(
87477 +       int cmd, void *arg)
87478 +{
87479 +       return _hypercall2(int, callback_op, cmd, arg);
87480 +}
87481 +
87482 +static inline int
87483 +HYPERVISOR_xenoprof_op(
87484 +       int op, void *arg)
87485 +{
87486 +       return _hypercall2(int, xenoprof_op, op, arg);
87487 +}
87488 +
87489 +static inline int
87490 +HYPERVISOR_kexec_op(
87491 +       unsigned long op, void *args)
87492 +{
87493 +       return _hypercall2(int, kexec_op, op, args);
87494 +}
87495 +
87496 +#endif /* __HYPERCALL_H__ */
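
HYPERVISOR_event_channel_op() and HYPERVISOR_physdev_op() share a compat pattern: issue the new multiplexed hypercall first and, only on -ENOSYS from an older hypervisor, repackage the argument into the legacy per-command structure and retry. A stand-alone sketch of that fallback logic with mocked hypercalls (the mock_* names and structures are inventions for illustration, not Xen interfaces):

#include <stdio.h>
#include <string.h>
#include <errno.h>

struct evtchn_args { int port; };
struct compat_op   { int cmd; struct evtchn_args u; };

/* Mock "new" interface: pretend this hypervisor predates it. */
static int mock_event_channel_op(int cmd, struct evtchn_args *arg)
{
        (void)cmd; (void)arg;
        return -ENOSYS;
}

/* Mock "old" interface: takes the whole compat structure. */
static int mock_event_channel_op_compat(struct compat_op *op)
{
        printf("compat path: cmd=%d port=%d\n", op->cmd, op->u.port);
        return 0;
}

static int event_channel_op(int cmd, struct evtchn_args *arg)
{
        int rc = mock_event_channel_op(cmd, arg);

        if (rc == -ENOSYS) {            /* old hypervisor: retry via compat op */
                struct compat_op op;
                op.cmd = cmd;
                memcpy(&op.u, arg, sizeof(op.u));
                rc = mock_event_channel_op_compat(&op);
                memcpy(arg, &op.u, sizeof(op.u));
        }
        return rc;
}

int main(void)
{
        struct evtchn_args a = { .port = 3 };
        return event_channel_op(1, &a);
}
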
87497 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/hypervisor.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/hypervisor.h
87498 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/hypervisor.h   1970-01-01 00:00:00.000000000 +0000
87499 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/hypervisor.h 2007-02-02 19:10:58.000000000 +0000
87500 @@ -0,0 +1,2 @@
87501 +
87502 +#include <asm-i386/mach-xen/asm/hypervisor.h>
87503 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/io.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/io.h
87504 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/io.h   1970-01-01 00:00:00.000000000 +0000
87505 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/io.h 2007-02-02 19:10:58.000000000 +0000
87506 @@ -0,0 +1,301 @@
87507 +#ifndef _ASM_IO_H
87508 +#define _ASM_IO_H
87509 +
87510 +#include <asm/fixmap.h>
87511 +
87512 +/*
87513 + * This file contains the definitions for the x86 IO instructions
87514 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
87515 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
87516 + * versions of the single-IO instructions (inb_p/inw_p/..).
87517 + *
87518 + * This file is not meant to be obfuscating: it's just complicated
87519 + * to (a) handle it all in a way that makes gcc able to optimize it
87520 + * as well as possible and (b) to avoid writing the same thing
87521 + * over and over again with slight variations and possibly making a
87522 + * mistake somewhere.
87523 + */
87524 +
87525 +/*
87526 + * Thanks to James van Artsdalen for a better timing-fix than
87527 + * the two short jumps: using outb's to a nonexistent port seems
87528 + * to guarantee better timings even on fast machines.
87529 + *
87530 + * On the other hand, I'd like to be sure of a non-existent port:
87531 + * I feel a bit unsafe about using 0x80 (should be safe, though)
87532 + *
87533 + *             Linus
87534 + */
87535 +
87536 + /*
87537 +  *  Bit simplified and optimized by Jan Hubicka
87538 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
87539 +  *
87540 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
87541 +  *  isa_read[wl] and isa_write[wl] fixed
87542 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
87543 +  */
87544 +
87545 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
87546 +
87547 +#ifdef REALLY_SLOW_IO
87548 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
87549 +#else
87550 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
87551 +#endif
87552 +
87553 +/*
87554 + * Talk about misusing macros...
87555 + */
87556 +#define __OUT1(s,x) \
87557 +static inline void out##s(unsigned x value, unsigned short port) {
87558 +
87559 +#define __OUT2(s,s1,s2) \
87560 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
87561 +
87562 +#define __OUT(s,s1,x) \
87563 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
87564 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
87565 +
87566 +#define __IN1(s) \
87567 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
87568 +
87569 +#define __IN2(s,s1,s2) \
87570 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
87571 +
87572 +#define __IN(s,s1,i...) \
87573 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
87574 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
87575 +
87576 +#define __INS(s) \
87577 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
87578 +{ __asm__ __volatile__ ("rep ; ins" #s \
87579 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
87580 +
87581 +#define __OUTS(s) \
87582 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
87583 +{ __asm__ __volatile__ ("rep ; outs" #s \
87584 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
87585 +
87586 +#define RETURN_TYPE unsigned char
87587 +__IN(b,"")
87588 +#undef RETURN_TYPE
87589 +#define RETURN_TYPE unsigned short
87590 +__IN(w,"")
87591 +#undef RETURN_TYPE
87592 +#define RETURN_TYPE unsigned int
87593 +__IN(l,"")
87594 +#undef RETURN_TYPE
87595 +
87596 +__OUT(b,"b",char)
87597 +__OUT(w,"w",short)
87598 +__OUT(l,,int)
87599 +
87600 +__INS(b)
87601 +__INS(w)
87602 +__INS(l)
87603 +
87604 +__OUTS(b)
87605 +__OUTS(w)
87606 +__OUTS(l)
87607 +
87608 +#define IO_SPACE_LIMIT 0xffff
87609 +
87610 +#if defined(__KERNEL__) && __x86_64__
87611 +
87612 +#include <linux/vmalloc.h>
87613 +
87614 +#ifndef __i386__
87615 +/*
87616 + * Change virtual addresses to physical addresses and vv.
87617 + * These are pretty trivial
87618 + */
87619 +static inline unsigned long virt_to_phys(volatile void * address)
87620 +{
87621 +       return __pa(address);
87622 +}
87623 +
87624 +static inline void * phys_to_virt(unsigned long address)
87625 +{
87626 +       return __va(address);
87627 +}
87628 +
87629 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
87630 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
87631 +#endif
87632 +
87633 +/*
87634 + * Change "struct page" to physical address.
87635 + */
87636 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
87637 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
87638 +#define page_to_bus(page)       (phys_to_machine(page_to_pseudophys(page)))
87639 +
87640 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
87641 +                                 (unsigned long) bio_offset((bio)))
87642 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
87643 +                                 (unsigned long) (bv)->bv_offset)
87644 +
87645 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
87646 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
87647 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
87648 +         bvec_to_pseudophys((vec2))))
87649 +
87650 +#include <asm-generic/iomap.h>
87651 +
87652 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
87653 +
87654 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
87655 +{
87656 +       return __ioremap(offset, size, 0);
87657 +}
87658 +
87659 +extern void *early_ioremap(unsigned long addr, unsigned long size);
87660 +extern void early_iounmap(void *addr, unsigned long size);
87661 +
87662 +/*
87663 + * This one maps high address device memory and turns off caching for that area.
87664 + * It is useful if some control registers are in such an area and write combining
87665 + * or read caching is not desirable:
87666 + */
87667 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
87668 +extern void iounmap(volatile void __iomem *addr);
87669 +
87670 +/*
87671 + * ISA I/O bus memory addresses are 1:1 with the physical address.
87672 + */
87673 +
87674 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
87675 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
87676 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
87677 +
87678 +/*
87679 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
87680 + * are forbidden in portable PCI drivers.
87681 + *
87682 + * Allow them on x86 for legacy drivers, though.
87683 + */
87684 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
87685 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
87686 +
87687 +/*
87688 + * readX/writeX() are used to access memory mapped devices. On some
87689 + * architectures the memory mapped IO stuff needs to be accessed
87690 + * differently. On the x86 architecture, we just read/write the
87691 + * memory location directly.
87692 + */
87693 +
87694 +static inline __u8 __readb(const volatile void __iomem *addr)
87695 +{
87696 +       return *(__force volatile __u8 *)addr;
87697 +}
87698 +static inline __u16 __readw(const volatile void __iomem *addr)
87699 +{
87700 +       return *(__force volatile __u16 *)addr;
87701 +}
87702 +static __always_inline __u32 __readl(const volatile void __iomem *addr)
87703 +{
87704 +       return *(__force volatile __u32 *)addr;
87705 +}
87706 +static inline __u64 __readq(const volatile void __iomem *addr)
87707 +{
87708 +       return *(__force volatile __u64 *)addr;
87709 +}
87710 +#define readb(x) __readb(x)
87711 +#define readw(x) __readw(x)
87712 +#define readl(x) __readl(x)
87713 +#define readq(x) __readq(x)
87714 +#define readb_relaxed(a) readb(a)
87715 +#define readw_relaxed(a) readw(a)
87716 +#define readl_relaxed(a) readl(a)
87717 +#define readq_relaxed(a) readq(a)
87718 +#define __raw_readb readb
87719 +#define __raw_readw readw
87720 +#define __raw_readl readl
87721 +#define __raw_readq readq
87722 +
87723 +#define mmiowb()
87724 +
87725 +static inline void __writel(__u32 b, volatile void __iomem *addr)
87726 +{
87727 +       *(__force volatile __u32 *)addr = b;
87728 +}
87729 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
87730 +{
87731 +       *(__force volatile __u64 *)addr = b;
87732 +}
87733 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
87734 +{
87735 +       *(__force volatile __u8 *)addr = b;
87736 +}
87737 +static inline void __writew(__u16 b, volatile void __iomem *addr)
87738 +{
87739 +       *(__force volatile __u16 *)addr = b;
87740 +}
87741 +#define writeq(val,addr) __writeq((val),(addr))
87742 +#define writel(val,addr) __writel((val),(addr))
87743 +#define writew(val,addr) __writew((val),(addr))
87744 +#define writeb(val,addr) __writeb((val),(addr))
87745 +#define __raw_writeb writeb
87746 +#define __raw_writew writew
87747 +#define __raw_writel writel
87748 +#define __raw_writeq writeq
87749 +
87750 +void __memcpy_fromio(void*,unsigned long,unsigned);
87751 +void __memcpy_toio(unsigned long,const void*,unsigned);
87752 +
87753 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
87754 +{
87755 +       __memcpy_fromio(to,(unsigned long)from,len);
87756 +}
87757 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
87758 +{
87759 +       __memcpy_toio((unsigned long)to,from,len);
87760 +}
87761 +
87762 +void memset_io(volatile void __iomem *a, int b, size_t c);
87763 +
87764 +/*
87765 + * ISA space is 'always mapped' on a typical x86 system, no need to
87766 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
87767 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
87768 + * are physical addresses. The following constant pointer can be
87769 + * used as the IO-area pointer (it can be iounmapped as well, so the
87770 + * analogy with PCI is quite large):
87771 + */
87772 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
87773 +
87774 +/*
87775 + * Again, x86-64 does not require mem IO specific function.
87776 + */
87777 +
87778 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void *)(b),(c),(d))
87779 +
87780 +/* Nothing to do */
87781 +
87782 +#define dma_cache_inv(_start,_size)            do { } while (0)
87783 +#define dma_cache_wback(_start,_size)          do { } while (0)
87784 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
87785 +
87786 +#define flush_write_buffers() 
87787 +
87788 +extern int iommu_bio_merge;
87789 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
87790 +
87791 +/*
87792 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
87793 + * access
87794 + */
87795 +#define xlate_dev_mem_ptr(p, sz)       ioremap(p, sz)
87796 +#define xlate_dev_mem_ptr_unmap(p)     iounmap(p)
87797 +
87798 +/*
87799 + * Convert a virtual cached pointer to an uncached pointer
87800 + */
87801 +#define xlate_dev_kmem_ptr(p)  p
87802 +
87803 +#endif /* __KERNEL__ */
87804 +
87805 +#define ARCH_HAS_DEV_MEM
87806 +
87807 +#endif
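
For readability, here is a rough expansion of __OUT(b,"b",char) from the macro family above, shown only as a sketch of what the preprocessor produces (the header itself never spells these helpers out):

static inline void outb(unsigned char value, unsigned short port)
{
        /* "outb %b0,%w1": write the byte in AL to the port number held in DX */
        __asm__ __volatile__ ("outb %b0,%w1" : : "a" (value), "Nd" (port));
}

static inline void outb_p(unsigned char value, unsigned short port)
{
        /* same, with __FULL_SLOW_DOWN_IO appended to pause briefly after the access */
        __asm__ __volatile__ ("outb %b0,%w1" __FULL_SLOW_DOWN_IO
                              : : "a" (value), "Nd" (port));
}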
87808 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/irq.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/irq.h
87809 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/irq.h  1970-01-01 00:00:00.000000000 +0000
87810 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/irq.h        2007-02-02 19:10:58.000000000 +0000
87811 @@ -0,0 +1,34 @@
87812 +#ifndef _ASM_IRQ_H
87813 +#define _ASM_IRQ_H
87814 +
87815 +/*
87816 + *     linux/include/asm/irq.h
87817 + *
87818 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
87819 + *
87820 + *     IRQ/IPI changes taken from work by Thomas Radke
87821 + *     <tomsoft@informatik.tu-chemnitz.de>
87822 + */
87823 +
87824 +#include <linux/sched.h>
87825 +/* include comes from machine specific directory */
87826 +#include "irq_vectors.h"
87827 +#include <asm/thread_info.h>
87828 +
87829 +static __inline__ int irq_canonicalize(int irq)
87830 +{
87831 +       return ((irq == 2) ? 9 : irq);
87832 +}
87833 +
87834 +#define ARCH_HAS_NMI_WATCHDOG          /* See include/linux/nmi.h */
87835 +
87836 +# define irq_ctx_init(cpu) do { } while (0)
87837 +
87838 +#ifdef CONFIG_HOTPLUG_CPU
87839 +#include <linux/cpumask.h>
87840 +extern void fixup_irqs(cpumask_t map);
87841 +#endif
87842 +
87843 +#define __ARCH_HAS_DO_SOFTIRQ 1
87844 +
87845 +#endif /* _ASM_IRQ_H */
87846 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/irqflags.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/irqflags.h
87847 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/irqflags.h     1970-01-01 00:00:00.000000000 +0000
87848 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/irqflags.h   2007-02-02 19:10:58.000000000 +0000
87849 @@ -0,0 +1,65 @@
87850 +/*
87851 + * include/asm-x86_64/irqflags.h
87852 + *
87853 + * IRQ flags handling
87854 + *
87855 + * This file gets included from lowlevel asm headers too, to provide
87856 + * wrapped versions of the local_irq_*() APIs, based on the
87857 + * raw_local_irq_*() functions from the lowlevel headers.
87858 + */
87859 +#ifndef _ASM_IRQFLAGS_H
87860 +#define _ASM_IRQFLAGS_H
87861 +
87862 +#ifndef __ASSEMBLY__
87863 +/*
87864 + * Interrupt control:
87865 + */
87866 +
87867 +unsigned long __raw_local_save_flags(void);
87868 +#define raw_local_save_flags(flags) \
87869 +               do { (flags) = __raw_local_save_flags(); } while (0)
87870 +
87871 +void raw_local_irq_restore(unsigned long flags);
87872 +void raw_local_irq_disable(void);
87873 +void raw_local_irq_enable(void);
87874 +
87875 +static inline int raw_irqs_disabled_flags(unsigned long flags)
87876 +{
87877 +       return flags != 0;
87878 +}
87879 +
87880 +/*
87881 + * For spinlocks, etc.:
87882 + */
87883 +
87884 +unsigned long __raw_local_irq_save(void);
87885 +
87886 +#define raw_local_irq_save(flags) \
87887 +               do { (flags) = __raw_local_irq_save(); } while (0)
87888 +
87889 +int raw_irqs_disabled(void);
87890 +
87891 +/*
87892 + * Used in the idle loop; sti takes one instruction cycle
87893 + * to complete:
87894 + */
87895 +void raw_safe_halt(void);
87896 +
87897 +
87898 +/*
87899 + * Used when interrupts are already enabled or to
87900 + * shutdown the processor:
87901 + */
87902 +void halt(void);
87903 +
87904 +#else /* __ASSEMBLY__: */
87905 +# ifdef CONFIG_TRACE_IRQFLAGS
87906 +#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk
87907 +#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk
87908 +# else
87909 +#  define TRACE_IRQS_ON
87910 +#  define TRACE_IRQS_OFF
87911 +# endif
87912 +#endif
87913 +
87914 +#endif
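
These declarations replace the usual inline pushf/cli/sti sequences with out-of-line wrappers, presumably because the Xen event-delivery mask has to be manipulated instead of the hardware interrupt flag. A minimal usage sketch of the save/restore pattern the wrappers support (illustrative only):

        unsigned long flags;

        raw_local_irq_save(flags);      /* flags = __raw_local_irq_save(); delivery off */
        /* ... touch state that must not race with interrupt handlers ... */
        raw_local_irq_restore(flags);   /* re-enable only if it was enabled before */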
87915 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/maddr.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/maddr.h
87916 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/maddr.h        1970-01-01 00:00:00.000000000 +0000
87917 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/maddr.h      2007-02-02 19:10:58.000000000 +0000
87918 @@ -0,0 +1,150 @@
87919 +#ifndef _X86_64_MADDR_H
87920 +#define _X86_64_MADDR_H
87921 +
87922 +#include <xen/features.h>
87923 +#include <xen/interface/xen.h>
87924 +
87925 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
87926 +#define INVALID_P2M_ENTRY      (~0UL)
87927 +#define FOREIGN_FRAME_BIT      (1UL<<63)
87928 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
87929 +
87930 +/* Definitions for machine and pseudophysical addresses. */
87931 +typedef unsigned long paddr_t;
87932 +typedef unsigned long maddr_t;
87933 +
87934 +#ifdef CONFIG_XEN
87935 +
87936 +extern unsigned long *phys_to_machine_mapping;
87937 +
87938 +#undef machine_to_phys_mapping
87939 +extern unsigned long *machine_to_phys_mapping;
87940 +extern unsigned int   machine_to_phys_order;
87941 +
87942 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
87943 +{
87944 +       if (xen_feature(XENFEAT_auto_translated_physmap))
87945 +               return pfn;
87946 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
87947 +               ~FOREIGN_FRAME_BIT;
87948 +}
87949 +
87950 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
87951 +{
87952 +       if (xen_feature(XENFEAT_auto_translated_physmap))
87953 +               return 1;
87954 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
87955 +}
87956 +
87957 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
87958 +{
87959 +       unsigned long pfn;
87960 +
87961 +       if (xen_feature(XENFEAT_auto_translated_physmap))
87962 +               return mfn;
87963 +
87964 +       if (unlikely((mfn >> machine_to_phys_order) != 0))
87965 +               return end_pfn;
87966 +
87967 +       /* The array access can fail (e.g., device space beyond end of RAM). */
87968 +       asm (
87969 +               "1:     movq %1,%0\n"
87970 +               "2:\n"
87971 +               ".section .fixup,\"ax\"\n"
87972 +               "3:     movq %2,%0\n"
87973 +               "       jmp  2b\n"
87974 +               ".previous\n"
87975 +               ".section __ex_table,\"a\"\n"
87976 +               "       .align 8\n"
87977 +               "       .quad 1b,3b\n"
87978 +               ".previous"
87979 +               : "=r" (pfn)
87980 +               : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
87981 +
87982 +       return pfn;
87983 +}
87984 +
87985 +/*
87986 + * We detect special mappings in one of two ways:
87987 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
87988 + *     to be outside our maximum possible pseudophys range.
87989 + *  2. If the MFN belongs to a different domain then we will certainly
87990 + *     not have MFN in our p2m table. Conversely, if the page is ours,
87991 + *     then we'll have p2m(m2p(MFN))==MFN.
87992 + * If we detect a special mapping then it doesn't have a 'struct page'.
87993 + * We force !pfn_valid() by returning an out-of-range pointer.
87994 + *
87995 + * NB. These checks require that, for any MFN that is not in our reservation,
87996 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
87997 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
87998 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
87999 + *
88000 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
88001 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
88002 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
88003 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
88004 + */
88005 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
88006 +{
88007 +       unsigned long pfn = mfn_to_pfn(mfn);
88008 +       if ((pfn < end_pfn)
88009 +           && !xen_feature(XENFEAT_auto_translated_physmap)
88010 +           && (phys_to_machine_mapping[pfn] != mfn))
88011 +               return end_pfn; /* force !pfn_valid() */
88012 +       return pfn;
88013 +}
88014 +
88015 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
88016 +{
88017 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
88018 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
88019 +               return;
88020 +       }
88021 +       phys_to_machine_mapping[pfn] = mfn;
88022 +}
88023 +
88024 +static inline maddr_t phys_to_machine(paddr_t phys)
88025 +{
88026 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
88027 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
88028 +       return machine;
88029 +}
88030 +
88031 +static inline paddr_t machine_to_phys(maddr_t machine)
88032 +{
88033 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
88034 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
88035 +       return phys;
88036 +}
88037 +
88038 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
88039 +{
88040 +       paddr_t phys;
88041 +       phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
88042 +       phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
88043 +       return phys;
88044 +}
88045 +
88046 +#else /* !CONFIG_XEN */
88047 +
88048 +#define pfn_to_mfn(pfn) (pfn)
88049 +#define mfn_to_pfn(mfn) (mfn)
88050 +#define mfn_to_local_pfn(mfn) (mfn)
88051 +#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn))
88052 +#define phys_to_machine_mapping_valid(pfn) (1)
88053 +#define phys_to_machine(phys) ((maddr_t)(phys))
88054 +#define machine_to_phys(mach) ((paddr_t)(mach))
88055 +#define pte_machine_to_phys(mach) ((paddr_t)(mach))
88056 +
88057 +#endif /* !CONFIG_XEN */
88058 +
88059 +/* VIRT <-> MACHINE conversion */
88060 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
88061 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
88062 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
88063 +
88064 +#define __pte_ma(x)     ((pte_t) { (x) } )
88065 +#define pfn_pte_ma(pfn, prot)  __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
88066 +
88067 +#endif /* _X86_64_MADDR_H */
88068 +
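phys_to_machine() and machine_to_phys() above only translate the frame number through the p2m/m2p tables; the offset within the page passes through unchanged. A minimal user-space sketch of that arithmetic, using a made-up four-entry p2m table as a stand-in for the real mapping:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Toy p2m table: pseudo-physical frames 0..3 map to these machine frames. */
static unsigned long p2m[4] = { 7, 2, 9, 4 };

static unsigned long pfn_to_mfn(unsigned long pfn) { return p2m[pfn]; }

static unsigned long phys_to_machine(unsigned long phys)
{
        unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT);
        return (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
}

int main(void)
{
        unsigned long phys = (2UL << PAGE_SHIFT) | 0x123;   /* frame 2, offset 0x123 */
        /* Frame number is remapped (2 -> 9); the in-page offset survives untouched. */
        printf("phys %#lx -> machine %#lx\n", phys, phys_to_machine(phys));
        return 0;
}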
88069 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/mmu.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/mmu.h
88070 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/mmu.h  1970-01-01 00:00:00.000000000 +0000
88071 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/mmu.h        2007-02-02 19:10:58.000000000 +0000
88072 @@ -0,0 +1,38 @@
88073 +#ifndef __x86_64_MMU_H
88074 +#define __x86_64_MMU_H
88075 +
88076 +#include <linux/spinlock.h>
88077 +#include <asm/semaphore.h>
88078 +
88079 +/*
88080 + * The x86_64 doesn't have a mmu context, but
88081 + * we put the segment information here.
88082 + *
88083 + * cpu_vm_mask is used to optimize ldt flushing.
88084 + */
88085 +typedef struct { 
88086 +       void *ldt;
88087 +       rwlock_t ldtlock; 
88088 +       int size;
88089 +       struct semaphore sem; 
88090 +#ifdef CONFIG_XEN
88091 +       unsigned pinned:1;
88092 +       unsigned has_foreign_mappings:1;
88093 +       struct list_head unpinned;
88094 +#endif
88095 +} mm_context_t;
88096 +
88097 +#ifdef CONFIG_XEN
88098 +extern struct list_head mm_unpinned;
88099 +extern spinlock_t mm_unpinned_lock;
88100 +
88101 +/* mm/memory.c:exit_mmap hook */
88102 +extern void _arch_exit_mmap(struct mm_struct *mm);
88103 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
88104 +
88105 +/* kernel/fork.c:dup_mmap hook */
88106 +extern void _arch_dup_mmap(struct mm_struct *mm);
88107 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
88108 +#endif
88109 +
88110 +#endif
88111 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/mmu_context.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/mmu_context.h
88112 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/mmu_context.h  1970-01-01 00:00:00.000000000 +0000
88113 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/mmu_context.h        2007-02-02 19:10:58.000000000 +0000
88114 @@ -0,0 +1,135 @@
88115 +#ifndef __X86_64_MMU_CONTEXT_H
88116 +#define __X86_64_MMU_CONTEXT_H
88117 +
88118 +#include <asm/desc.h>
88119 +#include <asm/atomic.h>
88120 +#include <asm/pgalloc.h>
88121 +#include <asm/page.h>
88122 +#include <asm/pda.h>
88123 +#include <asm/pgtable.h>
88124 +#include <asm/tlbflush.h>
88125 +
88126 +/*
88127 + * possibly do the LDT unload here?
88128 + */
88129 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
88130 +void destroy_context(struct mm_struct *mm);
88131 +
88132 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
88133 +{
88134 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
88135 +       if (read_pda(mmu_state) == TLBSTATE_OK) 
88136 +               write_pda(mmu_state, TLBSTATE_LAZY);
88137 +#endif
88138 +}
88139 +
88140 +#define prepare_arch_switch(next)      __prepare_arch_switch()
88141 +
88142 +static inline void __prepare_arch_switch(void)
88143 +{
88144 +       /*
88145 +        * Save away %es, %ds, %fs and %gs. Must happen before reload
88146 +        * of cr3/ldt (i.e., not in __switch_to).
88147 +        */
88148 +       __asm__ __volatile__ (
88149 +               "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
88150 +               : "=m" (current->thread.es),
88151 +                 "=m" (current->thread.ds),
88152 +                 "=m" (current->thread.fsindex),
88153 +                 "=m" (current->thread.gsindex) );
88154 +
88155 +       if (current->thread.ds)
88156 +               __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
88157 +
88158 +       if (current->thread.es)
88159 +               __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
88160 +
88161 +       if (current->thread.fsindex) {
88162 +               __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
88163 +               current->thread.fs = 0;
88164 +       }
88165 +
88166 +       if (current->thread.gsindex) {
88167 +               load_gs_index(0);
88168 +               current->thread.gs = 0;
88169 +       }
88170 +}
88171 +
88172 +extern void mm_pin(struct mm_struct *mm);
88173 +extern void mm_unpin(struct mm_struct *mm);
88174 +void mm_pin_all(void);
88175 +
88176 +static inline void load_cr3(pgd_t *pgd)
88177 +{
88178 +       asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
88179 +                    "memory");
88180 +}
88181 +
88182 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
88183 +                            struct task_struct *tsk)
88184 +{
88185 +       unsigned cpu = smp_processor_id();
88186 +       struct mmuext_op _op[3], *op = _op;
88187 +
88188 +       if (likely(prev != next)) {
88189 +               BUG_ON(!next->context.pinned);
88190 +
88191 +               /* stop flush ipis for the previous mm */
88192 +               cpu_clear(cpu, prev->cpu_vm_mask);
88193 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
88194 +               write_pda(mmu_state, TLBSTATE_OK);
88195 +               write_pda(active_mm, next);
88196 +#endif
88197 +               cpu_set(cpu, next->cpu_vm_mask);
88198 +
88199 +               /* load_cr3(next->pgd) */
88200 +               op->cmd = MMUEXT_NEW_BASEPTR;
88201 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
88202 +               op++;
88203 +
88204 +               /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
88205 +               op->cmd = MMUEXT_NEW_USER_BASEPTR;
88206 +               op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
88207 +               op++;
88208 +               
88209 +               if (unlikely(next->context.ldt != prev->context.ldt)) {
88210 +                       /* load_LDT_nolock(&next->context, cpu) */
88211 +                       op->cmd = MMUEXT_SET_LDT;
88212 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
88213 +                       op->arg2.nr_ents     = next->context.size;
88214 +                       op++;
88215 +               }
88216 +
88217 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
88218 +       }
88219 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
88220 +       else {
88221 +               write_pda(mmu_state, TLBSTATE_OK);
88222 +               if (read_pda(active_mm) != next)
88223 +                       out_of_line_bug();
88224 +               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
88225 +                       /* We were in lazy tlb mode and leave_mm disabled 
88226 +                        * tlb flush IPI delivery. We must reload CR3
88227 +                        * to make sure to use no freed page tables.
88228 +                        */
88229 +                        load_cr3(next->pgd);
88230 +                        xen_new_user_pt(__pa(__user_pgd(next->pgd)));          
88231 +                       load_LDT_nolock(&next->context, cpu);
88232 +               }
88233 +       }
88234 +#endif
88235 +}
88236 +
88237 +#define deactivate_mm(tsk,mm)  do { \
88238 +       load_gs_index(0); \
88239 +       asm volatile("movl %0,%%fs"::"r"(0));  \
88240 +} while(0)
88241 +
88242 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
88243 +{
88244 +       if (!next->context.pinned)
88245 +               mm_pin(next);
88246 +       switch_mm(prev, next, NULL);
88247 +}
88248 +
88249 +#endif
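
switch_mm() above queues up to three MMU-extension requests (new base pointer, new user base pointer, and an LDT switch when needed) in the _op[] array and submits them with a single HYPERVISOR_mmuext_op() hypercall. The sketch below shows only that batching shape, with stand-in types and a stand-in submit function; the real commands and hypercall are as in the patch:

#include <stdio.h>

/* Stand-in for struct mmuext_op: just records which command was queued. */
struct op { int cmd; unsigned long arg; };

/* Stand-in for HYPERVISOR_mmuext_op(): here it only reports the batch size. */
static int submit(struct op *ops, int count)
{
        printf("submitting %d op(s) in one call\n", count);
        return 0;
}

int main(void)
{
        struct op _op[3], *op = _op;
        int ldt_changed = 1;                    /* pretend the LDT differs */

        op->cmd = 1; op->arg = 0xaaaa; op++;    /* e.g. MMUEXT_NEW_BASEPTR */
        op->cmd = 2; op->arg = 0xbbbb; op++;    /* e.g. MMUEXT_NEW_USER_BASEPTR */
        if (ldt_changed) {
                op->cmd = 3; op->arg = 0xcccc; op++;   /* e.g. MMUEXT_SET_LDT */
        }
        /* op - _op counts how many slots were filled; one submission covers them all. */
        return submit(_op, (int)(op - _op));
}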
88250 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/msr.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/msr.h
88251 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/msr.h  1970-01-01 00:00:00.000000000 +0000
88252 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/msr.h        2007-02-02 19:10:58.000000000 +0000
88253 @@ -0,0 +1,410 @@
88254 +#ifndef X86_64_MSR_H
88255 +#define X86_64_MSR_H 1
88256 +
88257 +#ifndef __ASSEMBLY__
88258 +/*
88259 + * Access to machine-specific registers (available on 586 and better only)
88260 + * Note: the rd* operations modify the parameters directly (without using
88261 + * pointer indirection), this allows gcc to optimize better
88262 + */
88263 +
88264 +#define rdmsr(msr,val1,val2) \
88265 +       __asm__ __volatile__("rdmsr" \
88266 +                           : "=a" (val1), "=d" (val2) \
88267 +                           : "c" (msr))
88268 +
88269 +
88270 +#define rdmsrl(msr,val) do { unsigned long a__,b__; \
88271 +       __asm__ __volatile__("rdmsr" \
88272 +                           : "=a" (a__), "=d" (b__) \
88273 +                           : "c" (msr)); \
88274 +       val = a__ | (b__<<32); \
88275 +} while(0)
88276 +
88277 +#define wrmsr(msr,val1,val2) \
88278 +     __asm__ __volatile__("wrmsr" \
88279 +                         : /* no outputs */ \
88280 +                         : "c" (msr), "a" (val1), "d" (val2))
88281 +
88282 +#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32) 
88283 +
88284 +/* wrmsr with exception handling */
88285 +#define wrmsr_safe(msr,a,b) ({ int ret__;                      \
88286 +       asm volatile("2: wrmsr ; xorl %0,%0\n"                  \
88287 +                    "1:\n\t"                                   \
88288 +                    ".section .fixup,\"ax\"\n\t"               \
88289 +                    "3:  movl %4,%0 ; jmp 1b\n\t"              \
88290 +                    ".previous\n\t"                            \
88291 +                    ".section __ex_table,\"a\"\n"              \
88292 +                    "   .align 8\n\t"                          \
88293 +                    "   .quad  2b,3b\n\t"                      \
88294 +                    ".previous"                                \
88295 +                    : "=a" (ret__)                             \
88296 +                    : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
88297 +       ret__; })
88298 +
88299 +#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
88300 +
88301 +#define rdmsr_safe(msr,a,b) \
88302 +       ({ int ret__;                                           \
88303 +         asm volatile ("1:       rdmsr\n"                      \
88304 +                      "2:\n"                                   \
88305 +                      ".section .fixup,\"ax\"\n"               \
88306 +                      "3:       movl %4,%0\n"                  \
88307 +                      " jmp 2b\n"                              \
88308 +                      ".previous\n"                            \
88309 +                      ".section __ex_table,\"a\"\n"            \
88310 +                      " .align 8\n"                            \
88311 +                      " .quad 1b,3b\n"                         \
88312 +                      ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
88313 +                      :"c"(msr), "i"(-EIO), "0"(0));           \
88314 +         ret__; })             
88315 +
88316 +#define rdtsc(low,high) \
88317 +     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
88318 +
88319 +#define rdtscl(low) \
88320 +     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
88321 +
88322 +#define rdtscp(low,high,aux) \
88323 +     asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux))
88324 +
88325 +#define rdtscll(val) do { \
88326 +     unsigned int __a,__d; \
88327 +     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
88328 +     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
88329 +} while(0)
88330 +
88331 +#define rdtscpll(val, aux) do { \
88332 +     unsigned long __a, __d; \
88333 +     asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
88334 +     (val) = (__d << 32) | __a; \
88335 +} while (0)
88336 +
88337 +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
88338 +
88339 +#define write_rdtscp_aux(val) wrmsr(0xc0000103, val, 0)
88340 +
88341 +#define rdpmc(counter,low,high) \
88342 +     __asm__ __volatile__("rdpmc" \
88343 +                         : "=a" (low), "=d" (high) \
88344 +                         : "c" (counter))
88345 +
88346 +static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
88347 +                        unsigned int *ecx, unsigned int *edx)
88348 +{
88349 +       __asm__(XEN_CPUID
88350 +               : "=a" (*eax),
88351 +                 "=b" (*ebx),
88352 +                 "=c" (*ecx),
88353 +                 "=d" (*edx)
88354 +               : "0" (op));
88355 +}
88356 +
88357 +/* Some CPUID calls want 'count' to be placed in ecx */
88358 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
88359 +               int *edx)
88360 +{
88361 +       __asm__(XEN_CPUID
88362 +               : "=a" (*eax),
88363 +                 "=b" (*ebx),
88364 +                 "=c" (*ecx),
88365 +                 "=d" (*edx)
88366 +               : "0" (op), "c" (count));
88367 +}
88368 +
88369 +/*
88370 + * CPUID functions returning a single datum
88371 + */
88372 +static inline unsigned int cpuid_eax(unsigned int op)
88373 +{
88374 +       unsigned int eax;
88375 +
88376 +       __asm__(XEN_CPUID
88377 +               : "=a" (eax)
88378 +               : "0" (op)
88379 +               : "bx", "cx", "dx");
88380 +       return eax;
88381 +}
88382 +static inline unsigned int cpuid_ebx(unsigned int op)
88383 +{
88384 +       unsigned int eax, ebx;
88385 +
88386 +       __asm__(XEN_CPUID
88387 +               : "=a" (eax), "=b" (ebx)
88388 +               : "0" (op)
88389 +               : "cx", "dx" );
88390 +       return ebx;
88391 +}
88392 +static inline unsigned int cpuid_ecx(unsigned int op)
88393 +{
88394 +       unsigned int eax, ecx;
88395 +
88396 +       __asm__(XEN_CPUID
88397 +               : "=a" (eax), "=c" (ecx)
88398 +               : "0" (op)
88399 +               : "bx", "dx" );
88400 +       return ecx;
88401 +}
88402 +static inline unsigned int cpuid_edx(unsigned int op)
88403 +{
88404 +       unsigned int eax, edx;
88405 +
88406 +       __asm__(XEN_CPUID
88407 +               : "=a" (eax), "=d" (edx)
88408 +               : "0" (op)
88409 +               : "bx", "cx");
88410 +       return edx;
88411 +}
88412 +
88413 +#define MSR_IA32_UCODE_WRITE           0x79
88414 +#define MSR_IA32_UCODE_REV             0x8b
88415 +
88416 +
88417 +#endif
88418 +
88419 +/* AMD/K8 specific MSRs */ 
88420 +#define MSR_EFER 0xc0000080            /* extended feature register */
88421 +#define MSR_STAR 0xc0000081            /* legacy mode SYSCALL target */
88422 +#define MSR_LSTAR 0xc0000082           /* long mode SYSCALL target */
88423 +#define MSR_CSTAR 0xc0000083           /* compatibility mode SYSCALL target */
88424 +#define MSR_SYSCALL_MASK 0xc0000084    /* EFLAGS mask for syscall */
88425 +#define MSR_FS_BASE 0xc0000100         /* 64bit FS base */
88426 +#define MSR_GS_BASE 0xc0000101         /* 64bit GS base */
88427 +#define MSR_KERNEL_GS_BASE  0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ 
88428 +/* EFER bits: */ 
88429 +#define _EFER_SCE 0  /* SYSCALL/SYSRET */
88430 +#define _EFER_LME 8  /* Long mode enable */
88431 +#define _EFER_LMA 10 /* Long mode active (read-only) */
88432 +#define _EFER_NX 11  /* No execute enable */
88433 +
88434 +#define EFER_SCE (1<<_EFER_SCE)
88435 +#define EFER_LME (1<<_EFER_LME)
88436 +#define EFER_LMA (1<<_EFER_LMA)
88437 +#define EFER_NX (1<<_EFER_NX)
88438 +
88439 +/* Intel MSRs. Some also available on other CPUs */
88440 +#define MSR_IA32_TSC           0x10
88441 +#define MSR_IA32_PLATFORM_ID   0x17
88442 +
88443 +#define MSR_IA32_PERFCTR0      0xc1
88444 +#define MSR_IA32_PERFCTR1      0xc2
88445 +
88446 +#define MSR_MTRRcap            0x0fe
88447 +#define MSR_IA32_BBL_CR_CTL        0x119
88448 +
88449 +#define MSR_IA32_SYSENTER_CS   0x174
88450 +#define MSR_IA32_SYSENTER_ESP  0x175
88451 +#define MSR_IA32_SYSENTER_EIP  0x176
88452 +
88453 +#define MSR_IA32_MCG_CAP       0x179
88454 +#define MSR_IA32_MCG_STATUS        0x17a
88455 +#define MSR_IA32_MCG_CTL       0x17b
88456 +
88457 +#define MSR_IA32_EVNTSEL0      0x186
88458 +#define MSR_IA32_EVNTSEL1      0x187
88459 +
88460 +#define MSR_IA32_DEBUGCTLMSR       0x1d9
88461 +#define MSR_IA32_LASTBRANCHFROMIP  0x1db
88462 +#define MSR_IA32_LASTBRANCHTOIP        0x1dc
88463 +#define MSR_IA32_LASTINTFROMIP     0x1dd
88464 +#define MSR_IA32_LASTINTTOIP       0x1de
88465 +
88466 +#define MSR_MTRRfix64K_00000   0x250
88467 +#define MSR_MTRRfix16K_80000   0x258
88468 +#define MSR_MTRRfix16K_A0000   0x259
88469 +#define MSR_MTRRfix4K_C0000    0x268
88470 +#define MSR_MTRRfix4K_C8000    0x269
88471 +#define MSR_MTRRfix4K_D0000    0x26a
88472 +#define MSR_MTRRfix4K_D8000    0x26b
88473 +#define MSR_MTRRfix4K_E0000    0x26c
88474 +#define MSR_MTRRfix4K_E8000    0x26d
88475 +#define MSR_MTRRfix4K_F0000    0x26e
88476 +#define MSR_MTRRfix4K_F8000    0x26f
88477 +#define MSR_MTRRdefType                0x2ff
88478 +
88479 +#define MSR_IA32_MC0_CTL       0x400
88480 +#define MSR_IA32_MC0_STATUS        0x401
88481 +#define MSR_IA32_MC0_ADDR      0x402
88482 +#define MSR_IA32_MC0_MISC      0x403
88483 +
88484 +#define MSR_P6_PERFCTR0                        0xc1
88485 +#define MSR_P6_PERFCTR1                        0xc2
88486 +#define MSR_P6_EVNTSEL0                        0x186
88487 +#define MSR_P6_EVNTSEL1                        0x187
88488 +
88489 +/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
88490 +#define MSR_K7_EVNTSEL0            0xC0010000
88491 +#define MSR_K7_PERFCTR0            0xC0010004
88492 +#define MSR_K7_EVNTSEL1            0xC0010001
88493 +#define MSR_K7_PERFCTR1            0xC0010005
88494 +#define MSR_K7_EVNTSEL2            0xC0010002
88495 +#define MSR_K7_PERFCTR2            0xC0010006
88496 +#define MSR_K7_EVNTSEL3            0xC0010003
88497 +#define MSR_K7_PERFCTR3            0xC0010007
88498 +#define MSR_K8_TOP_MEM1                   0xC001001A
88499 +#define MSR_K8_TOP_MEM2                   0xC001001D
88500 +#define MSR_K8_SYSCFG             0xC0010010
88501 +#define MSR_K8_HWCR               0xC0010015
88502 +
88503 +/* K6 MSRs */
88504 +#define MSR_K6_EFER                    0xC0000080
88505 +#define MSR_K6_STAR                    0xC0000081
88506 +#define MSR_K6_WHCR                    0xC0000082
88507 +#define MSR_K6_UWCCR                   0xC0000085
88508 +#define MSR_K6_PSOR                    0xC0000087
88509 +#define MSR_K6_PFIR                    0xC0000088
88510 +
88511 +/* Centaur-Hauls/IDT defined MSRs. */
88512 +#define MSR_IDT_FCR1                   0x107
88513 +#define MSR_IDT_FCR2                   0x108
88514 +#define MSR_IDT_FCR3                   0x109
88515 +#define MSR_IDT_FCR4                   0x10a
88516 +
88517 +#define MSR_IDT_MCR0                   0x110
88518 +#define MSR_IDT_MCR1                   0x111
88519 +#define MSR_IDT_MCR2                   0x112
88520 +#define MSR_IDT_MCR3                   0x113
88521 +#define MSR_IDT_MCR4                   0x114
88522 +#define MSR_IDT_MCR5                   0x115
88523 +#define MSR_IDT_MCR6                   0x116
88524 +#define MSR_IDT_MCR7                   0x117
88525 +#define MSR_IDT_MCR_CTRL               0x120
88526 +
88527 +/* VIA Cyrix defined MSRs*/
88528 +#define MSR_VIA_FCR                    0x1107
88529 +#define MSR_VIA_LONGHAUL               0x110a
88530 +#define MSR_VIA_RNG                    0x110b
88531 +#define MSR_VIA_BCR2                   0x1147
88532 +
88533 +/* Intel defined MSRs. */
88534 +#define MSR_IA32_P5_MC_ADDR            0
88535 +#define MSR_IA32_P5_MC_TYPE            1
88536 +#define MSR_IA32_PLATFORM_ID           0x17
88537 +#define MSR_IA32_EBL_CR_POWERON                0x2a
88538 +
88539 +#define MSR_IA32_APICBASE               0x1b
88540 +#define MSR_IA32_APICBASE_BSP           (1<<8)
88541 +#define MSR_IA32_APICBASE_ENABLE        (1<<11)
88542 +#define MSR_IA32_APICBASE_BASE          (0xfffff<<12)
88543 +
88544 +/* P4/Xeon+ specific */
88545 +#define MSR_IA32_MCG_EAX               0x180
88546 +#define MSR_IA32_MCG_EBX               0x181
88547 +#define MSR_IA32_MCG_ECX               0x182
88548 +#define MSR_IA32_MCG_EDX               0x183
88549 +#define MSR_IA32_MCG_ESI               0x184
88550 +#define MSR_IA32_MCG_EDI               0x185
88551 +#define MSR_IA32_MCG_EBP               0x186
88552 +#define MSR_IA32_MCG_ESP               0x187
88553 +#define MSR_IA32_MCG_EFLAGS            0x188
88554 +#define MSR_IA32_MCG_EIP               0x189
88555 +#define MSR_IA32_MCG_RESERVED          0x18A
88556 +
88557 +#define MSR_P6_EVNTSEL0                        0x186
88558 +#define MSR_P6_EVNTSEL1                        0x187
88559 +
88560 +#define MSR_IA32_PERF_STATUS           0x198
88561 +#define MSR_IA32_PERF_CTL              0x199
88562 +
88563 +#define MSR_IA32_THERM_CONTROL         0x19a
88564 +#define MSR_IA32_THERM_INTERRUPT       0x19b
88565 +#define MSR_IA32_THERM_STATUS          0x19c
88566 +#define MSR_IA32_MISC_ENABLE           0x1a0
88567 +
88568 +#define MSR_IA32_DEBUGCTLMSR           0x1d9
88569 +#define MSR_IA32_LASTBRANCHFROMIP      0x1db
88570 +#define MSR_IA32_LASTBRANCHTOIP                0x1dc
88571 +#define MSR_IA32_LASTINTFROMIP         0x1dd
88572 +#define MSR_IA32_LASTINTTOIP           0x1de
88573 +
88574 +#define MSR_IA32_MC0_CTL               0x400
88575 +#define MSR_IA32_MC0_STATUS            0x401
88576 +#define MSR_IA32_MC0_ADDR              0x402
88577 +#define MSR_IA32_MC0_MISC              0x403
88578 +
88579 +/* Pentium IV performance counter MSRs */
88580 +#define MSR_P4_BPU_PERFCTR0            0x300
88581 +#define MSR_P4_BPU_PERFCTR1            0x301
88582 +#define MSR_P4_BPU_PERFCTR2            0x302
88583 +#define MSR_P4_BPU_PERFCTR3            0x303
88584 +#define MSR_P4_MS_PERFCTR0             0x304
88585 +#define MSR_P4_MS_PERFCTR1             0x305
88586 +#define MSR_P4_MS_PERFCTR2             0x306
88587 +#define MSR_P4_MS_PERFCTR3             0x307
88588 +#define MSR_P4_FLAME_PERFCTR0          0x308
88589 +#define MSR_P4_FLAME_PERFCTR1          0x309
88590 +#define MSR_P4_FLAME_PERFCTR2          0x30a
88591 +#define MSR_P4_FLAME_PERFCTR3          0x30b
88592 +#define MSR_P4_IQ_PERFCTR0             0x30c
88593 +#define MSR_P4_IQ_PERFCTR1             0x30d
88594 +#define MSR_P4_IQ_PERFCTR2             0x30e
88595 +#define MSR_P4_IQ_PERFCTR3             0x30f
88596 +#define MSR_P4_IQ_PERFCTR4             0x310
88597 +#define MSR_P4_IQ_PERFCTR5             0x311
88598 +#define MSR_P4_BPU_CCCR0               0x360
88599 +#define MSR_P4_BPU_CCCR1               0x361
88600 +#define MSR_P4_BPU_CCCR2               0x362
88601 +#define MSR_P4_BPU_CCCR3               0x363
88602 +#define MSR_P4_MS_CCCR0                0x364
88603 +#define MSR_P4_MS_CCCR1                0x365
88604 +#define MSR_P4_MS_CCCR2                0x366
88605 +#define MSR_P4_MS_CCCR3                0x367
88606 +#define MSR_P4_FLAME_CCCR0             0x368
88607 +#define MSR_P4_FLAME_CCCR1             0x369
88608 +#define MSR_P4_FLAME_CCCR2             0x36a
88609 +#define MSR_P4_FLAME_CCCR3             0x36b
88610 +#define MSR_P4_IQ_CCCR0                0x36c
88611 +#define MSR_P4_IQ_CCCR1                0x36d
88612 +#define MSR_P4_IQ_CCCR2                0x36e
88613 +#define MSR_P4_IQ_CCCR3                0x36f
88614 +#define MSR_P4_IQ_CCCR4                0x370
88615 +#define MSR_P4_IQ_CCCR5                0x371
88616 +#define MSR_P4_ALF_ESCR0               0x3ca
88617 +#define MSR_P4_ALF_ESCR1               0x3cb
88618 +#define MSR_P4_BPU_ESCR0               0x3b2
88619 +#define MSR_P4_BPU_ESCR1               0x3b3
88620 +#define MSR_P4_BSU_ESCR0               0x3a0
88621 +#define MSR_P4_BSU_ESCR1               0x3a1
88622 +#define MSR_P4_CRU_ESCR0               0x3b8
88623 +#define MSR_P4_CRU_ESCR1               0x3b9
88624 +#define MSR_P4_CRU_ESCR2               0x3cc
88625 +#define MSR_P4_CRU_ESCR3               0x3cd
88626 +#define MSR_P4_CRU_ESCR4               0x3e0
88627 +#define MSR_P4_CRU_ESCR5               0x3e1
88628 +#define MSR_P4_DAC_ESCR0               0x3a8
88629 +#define MSR_P4_DAC_ESCR1               0x3a9
88630 +#define MSR_P4_FIRM_ESCR0              0x3a4
88631 +#define MSR_P4_FIRM_ESCR1              0x3a5
88632 +#define MSR_P4_FLAME_ESCR0             0x3a6
88633 +#define MSR_P4_FLAME_ESCR1             0x3a7
88634 +#define MSR_P4_FSB_ESCR0               0x3a2
88635 +#define MSR_P4_FSB_ESCR1               0x3a3
88636 +#define MSR_P4_IQ_ESCR0                0x3ba
88637 +#define MSR_P4_IQ_ESCR1                0x3bb
88638 +#define MSR_P4_IS_ESCR0                0x3b4
88639 +#define MSR_P4_IS_ESCR1                0x3b5
88640 +#define MSR_P4_ITLB_ESCR0              0x3b6
88641 +#define MSR_P4_ITLB_ESCR1              0x3b7
88642 +#define MSR_P4_IX_ESCR0                0x3c8
88643 +#define MSR_P4_IX_ESCR1                0x3c9
88644 +#define MSR_P4_MOB_ESCR0               0x3aa
88645 +#define MSR_P4_MOB_ESCR1               0x3ab
88646 +#define MSR_P4_MS_ESCR0                0x3c0
88647 +#define MSR_P4_MS_ESCR1                0x3c1
88648 +#define MSR_P4_PMH_ESCR0               0x3ac
88649 +#define MSR_P4_PMH_ESCR1               0x3ad
88650 +#define MSR_P4_RAT_ESCR0               0x3bc
88651 +#define MSR_P4_RAT_ESCR1               0x3bd
88652 +#define MSR_P4_SAAT_ESCR0              0x3ae
88653 +#define MSR_P4_SAAT_ESCR1              0x3af
88654 +#define MSR_P4_SSU_ESCR0               0x3be
88655 +#define MSR_P4_SSU_ESCR1               0x3bf    /* guess: not defined in manual */
88656 +#define MSR_P4_TBPU_ESCR0              0x3c2
88657 +#define MSR_P4_TBPU_ESCR1              0x3c3
88658 +#define MSR_P4_TC_ESCR0                0x3c4
88659 +#define MSR_P4_TC_ESCR1                0x3c5
88660 +#define MSR_P4_U2L_ESCR0               0x3b0
88661 +#define MSR_P4_U2L_ESCR1               0x3b1
88662 +
88663 +#endif
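
rdmsrl() above reads a 64-bit MSR as two 32-bit halves in EAX and EDX and reassembles them. The sketch below shows only that reassembly with made-up halves (rdmsr itself is privileged), and assumes a 64-bit unsigned long as on x86-64:

#include <stdio.h>

int main(void)
{
        /* Pretend EAX/EDX came back from "rdmsr" with these hypothetical halves. */
        unsigned long a = 0x89abcdefUL;      /* low 32 bits (EAX) */
        unsigned long b = 0x01234567UL;      /* high 32 bits (EDX) */
        unsigned long val = a | (b << 32);   /* same combine as the rdmsrl() macro */

        printf("msr value = %#lx\n", val);   /* prints 0x123456789abcdef */
        return 0;
}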
88664 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/nmi.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/nmi.h
88665 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/nmi.h  1970-01-01 00:00:00.000000000 +0000
88666 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/nmi.h        2007-02-02 19:10:58.000000000 +0000
88667 @@ -0,0 +1,96 @@
88668 +/*
88669 + *  linux/include/asm-i386/nmi.h
88670 + */
88671 +#ifndef ASM_NMI_H
88672 +#define ASM_NMI_H
88673 +
88674 +#include <linux/pm.h>
88675 +#include <asm/io.h>
88676 +
88677 +#include <xen/interface/nmi.h>
88678 +
88679 +/**
88680 + * do_nmi_callback
88681 + *
88682 + * Check to see if a callback exists and execute it.  Return 1
88683 + * if the handler exists and was handled successfully.
88684 + */
88685 +int do_nmi_callback(struct pt_regs *regs, int cpu);
88686 +
88687 +#ifdef CONFIG_PM
88688 +
88689 +/** Replace the PM callback routine for NMI. */
88690 +struct pm_dev * set_nmi_pm_callback(pm_callback callback);
88691 +
88692 +/** Unset the PM callback routine back to the default. */
88693 +void unset_nmi_pm_callback(struct pm_dev * dev);
88694 +
88695 +#else
88696 +
88697 +static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
88698 +{
88699 +       return 0;
88700 +} 
88701 +
88702 +static inline void unset_nmi_pm_callback(struct pm_dev * dev)
88703 +{
88704 +}
88705 +
88706 +#endif /* CONFIG_PM */
88707 +
88708 +extern void default_do_nmi(struct pt_regs *);
88709 +extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
88710 +
88711 +static inline unsigned char get_nmi_reason(void)
88712 +{
88713 +        shared_info_t *s = HYPERVISOR_shared_info;
88714 +        unsigned char reason = 0;
88715 +
88716 +        /* construct a value which looks like it came from
88717 +         * port 0x61.
88718 +         */
88719 +        if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
88720 +                reason |= 0x40;
88721 +        if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
88722 +                reason |= 0x80;
88723 +
88724 +        return reason;
88725 +}
88726 +
88727 +extern int panic_on_timeout;
88728 +extern int unknown_nmi_panic;
88729 +extern int nmi_watchdog_enabled;
88730 +
88731 +extern int check_nmi_watchdog(void);
88732 +extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
88733 +extern int avail_to_resrv_perfctr_nmi(unsigned int);
88734 +extern int reserve_perfctr_nmi(unsigned int);
88735 +extern void release_perfctr_nmi(unsigned int);
88736 +extern int reserve_evntsel_nmi(unsigned int);
88737 +extern void release_evntsel_nmi(unsigned int);
88738 +
88739 +extern void setup_apic_nmi_watchdog (void *);
88740 +extern void stop_apic_nmi_watchdog (void *);
88741 +extern void disable_timer_nmi_watchdog(void);
88742 +extern void enable_timer_nmi_watchdog(void);
88743 +extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
88744 +
88745 +extern void nmi_watchdog_default(void);
88746 +extern int setup_nmi_watchdog(char *);
88747 +
88748 +extern atomic_t nmi_active;
88749 +extern unsigned int nmi_watchdog;
88750 +#define NMI_DEFAULT    -1
88751 +#define NMI_NONE       0
88752 +#define NMI_IO_APIC    1
88753 +#define NMI_LOCAL_APIC 2
88754 +#define NMI_INVALID    3
88755 +
88756 +struct ctl_table;
88757 +struct file;
88758 +extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
88759 +                       void __user *, size_t *, loff_t *);
88760 +
88761 +extern int unknown_nmi_panic;
88762 +
88763 +#endif /* ASM_NMI_H */
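
get_nmi_reason() above builds a value shaped like the legacy port-0x61 status byte: bit 6 (0x40) for an I/O check error and bit 7 (0x80) for a parity error. A small sketch of how such a value would be decoded; decode_nmi_reason() is illustrative, not a kernel function:

#include <stdio.h>

static void decode_nmi_reason(unsigned char reason)
{
        /* Bit layout mirrors what get_nmi_reason() synthesizes above. */
        if (reason & 0x80)
                printf("NMI: parity error reported\n");
        if (reason & 0x40)
                printf("NMI: I/O check error reported\n");
        if (!(reason & 0xc0))
                printf("NMI: unknown reason\n");
}

int main(void)
{
        decode_nmi_reason(0x40);   /* what get_nmi_reason() returns for io_error */
        return 0;
}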
88764 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/page.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/page.h
88765 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/page.h 1970-01-01 00:00:00.000000000 +0000
88766 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/page.h       2007-02-02 19:10:58.000000000 +0000
88767 @@ -0,0 +1,214 @@
88768 +#ifndef _X86_64_PAGE_H
88769 +#define _X86_64_PAGE_H
88770 +
88771 +/* #include <linux/string.h> */
88772 +#ifndef __ASSEMBLY__
88773 +#include <linux/kernel.h>
88774 +#include <linux/types.h>
88775 +#include <asm/bug.h>
88776 +#endif
88777 +#include <xen/interface/xen.h> 
88778 +#include <xen/foreign_page.h>
88779 +
88780 +#define arch_free_page(_page,_order)                   \
88781 +({     int foreign = PageForeign(_page);               \
88782 +       if (foreign)                                    \
88783 +               (PageForeignDestructor(_page))(_page);  \
88784 +       foreign;                                        \
88785 +})
88786 +#define HAVE_ARCH_FREE_PAGE
88787 +
88788 +#ifdef CONFIG_XEN_SCRUB_PAGES
88789 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
88790 +#else
88791 +#define scrub_pages(_p,_n) ((void)0)
88792 +#endif
88793 +
88794 +/* PAGE_SHIFT determines the page size */
88795 +#define PAGE_SHIFT     12
88796 +#ifdef __ASSEMBLY__
88797 +#define PAGE_SIZE      (0x1 << PAGE_SHIFT)
88798 +#else
88799 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
88800 +#endif
88801 +#define PAGE_MASK      (~(PAGE_SIZE-1))
88802 +
88803 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
88804 +#define __PHYSICAL_MASK_SHIFT  46
88805 +#define __PHYSICAL_MASK                ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
88806 +#define __VIRTUAL_MASK_SHIFT   48
88807 +#define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
88808 +
88809 +#define PHYSICAL_PAGE_MASK     (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
88810 +
88811 +#define THREAD_ORDER 1 
88812 +#define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
88813 +#define CURRENT_MASK (~(THREAD_SIZE-1))
88814 +
88815 +#define EXCEPTION_STACK_ORDER 0
88816 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
88817 +
88818 +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
88819 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
88820 +
88821 +#define IRQSTACK_ORDER 2
88822 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
88823 +
88824 +#define STACKFAULT_STACK 1
88825 +#define DOUBLEFAULT_STACK 2
88826 +#define NMI_STACK 3
88827 +#define DEBUG_STACK 4
88828 +#define MCE_STACK 5
88829 +#define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
88830 +
88831 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
88832 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
88833 +
88834 +#define HPAGE_SHIFT PMD_SHIFT
88835 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
88836 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
88837 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
88838 +
88839 +#ifdef __KERNEL__
88840 +#ifndef __ASSEMBLY__
88841 +
88842 +extern unsigned long end_pfn;
88843 +
88844 +#include <asm/maddr.h>
88845 +
88846 +void clear_page(void *);
88847 +void copy_page(void *, void *);
88848 +
88849 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
88850 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
88851 +
88852 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
88853 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
88854 +/*
88855 + * These are used to make use of C type-checking..
88856 + */
88857 +typedef struct { unsigned long pte; } pte_t;
88858 +typedef struct { unsigned long pmd; } pmd_t;
88859 +typedef struct { unsigned long pud; } pud_t;
88860 +typedef struct { unsigned long pgd; } pgd_t;
88861 +#define PTE_MASK       PHYSICAL_PAGE_MASK
88862 +
88863 +typedef struct { unsigned long pgprot; } pgprot_t;
88864 +
88865 +#define pte_val(x)     (((x).pte & 1) ? pte_machine_to_phys((x).pte) : \
88866 +                        (x).pte)
88867 +#define pte_val_ma(x)  ((x).pte)
88868 +
88869 +static inline unsigned long pmd_val(pmd_t x)
88870 +{
88871 +       unsigned long ret = x.pmd;
88872 +       if (ret) ret = pte_machine_to_phys(ret);
88873 +       return ret;
88874 +}
88875 +
88876 +static inline unsigned long pud_val(pud_t x)
88877 +{
88878 +       unsigned long ret = x.pud;
88879 +       if (ret) ret = pte_machine_to_phys(ret);
88880 +       return ret;
88881 +}
88882 +
88883 +static inline unsigned long pgd_val(pgd_t x)
88884 +{
88885 +       unsigned long ret = x.pgd;
88886 +       if (ret) ret = pte_machine_to_phys(ret);
88887 +       return ret;
88888 +}
88889 +
88890 +#define pgprot_val(x)  ((x).pgprot)
88891 +
88892 +static inline pte_t __pte(unsigned long x)
88893 +{
88894 +       if (x & 1) x = phys_to_machine(x);
88895 +       return ((pte_t) { (x) });
88896 +}
88897 +
88898 +static inline pmd_t __pmd(unsigned long x)
88899 +{
88900 +       if ((x & 1)) x = phys_to_machine(x);
88901 +       return ((pmd_t) { (x) });
88902 +}
88903 +
88904 +static inline pud_t __pud(unsigned long x)
88905 +{
88906 +       if ((x & 1)) x = phys_to_machine(x);
88907 +       return ((pud_t) { (x) });
88908 +}
88909 +
88910 +static inline pgd_t __pgd(unsigned long x)
88911 +{
88912 +       if ((x & 1)) x = phys_to_machine(x);
88913 +       return ((pgd_t) { (x) });
88914 +}
88915 +
88916 +#define __pgprot(x)    ((pgprot_t) { (x) } )
88917 +
88918 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
88919 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
88920 +#define __START_KERNEL_map     0xffffffff80000000UL
88921 +#define __PAGE_OFFSET           0xffff880000000000UL
88922 +
88923 +#else
88924 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
88925 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
88926 +#define __START_KERNEL_map     0xffffffff80000000
88927 +#define __PAGE_OFFSET           0xffff880000000000
88928 +#endif /* !__ASSEMBLY__ */
88929 +
88930 +#ifdef CONFIG_XEN_COMPAT_030002
88931 +#undef LOAD_OFFSET
88932 +#define LOAD_OFFSET            0
88933 +#endif /* CONFIG_XEN_COMPAT_030002 */
88934 +
88935 +/* to align the pointer to the (next) page boundary */
88936 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
88937 +
88938 +#define KERNEL_TEXT_SIZE  (40UL*1024*1024)
88939 +#define KERNEL_TEXT_START 0xffffffff80000000UL 
88940 +
88941 +#ifndef __ASSEMBLY__
88942 +
88943 +#include <asm/bug.h>
88944 +
88945 +#endif /* __ASSEMBLY__ */
88946 +
88947 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
88948 +
88949 +/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
88950 +   Otherwise you risk miscompilation. */ 
88951 +#define __pa(x)                        (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
88952 +/* __pa_symbol should be used for C visible symbols.
88953 +   This seems to be the official gcc blessed way to do such arithmetic. */ 
88954 +#define __pa_symbol(x)         \
88955 +       ({unsigned long v;  \
88956 +         asm("" : "=r" (v) : "0" (x)); \
88957 +         __pa(v); })
88958 +
88959 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
88960 +#define __boot_va(x)           __va(x)
88961 +#define __boot_pa(x)           __pa(x)
88962 +#ifdef CONFIG_FLATMEM
88963 +#define pfn_valid(pfn)         ((pfn) < end_pfn)
88964 +#endif
88965 +
88966 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
88967 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
88968 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
88969 +
88970 +#define VM_DATA_DEFAULT_FLAGS \
88971 +       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
88972 +        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
88973 +
88974 +#define __HAVE_ARCH_GATE_AREA 1        
88975 +
88976 +#include <asm-generic/memory_model.h>
88977 +#include <asm-generic/page.h>
88978 +
88979 +#endif /* __KERNEL__ */
88980 +
88981 +#endif /* _X86_64_PAGE_H */
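
PAGE_ALIGN() above rounds an address up to the next page boundary with the usual add-then-mask trick. A tiny user-space check of the same expression with hypothetical addresses:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
/* Same rounding expression as in the header above. */
#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        unsigned long a = 0x12345;
        /* 0x12345 lies in the page starting at 0x12000, so it rounds up to 0x13000. */
        printf("PAGE_ALIGN(%#lx) = %#lx\n", a, PAGE_ALIGN(a));
        /* An already aligned address is left alone. */
        printf("PAGE_ALIGN(%#lx) = %#lx\n", 0x13000UL, PAGE_ALIGN(0x13000UL));
        return 0;
}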
88982 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/pci.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/pci.h
88983 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/pci.h  1970-01-01 00:00:00.000000000 +0000
88984 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/pci.h        2007-02-02 19:10:58.000000000 +0000
88985 @@ -0,0 +1,173 @@
88986 +#ifndef __x8664_PCI_H
88987 +#define __x8664_PCI_H
88988 +
88989 +#include <asm/io.h>
88990 +
88991 +#ifdef __KERNEL__
88992 +
88993 +#include <linux/mm.h> /* for struct page */
88994 +
88995 +/* Can be used to override the logic in pci_scan_bus for skipping
88996 +   already-configured bus numbers - to be used for buggy BIOSes
88997 +   or architectures with incomplete PCI setup by the loader */
88998 +
88999 +#ifdef CONFIG_PCI
89000 +extern unsigned int pcibios_assign_all_busses(void);
89001 +#else
89002 +#define pcibios_assign_all_busses()    0
89003 +#endif
89004 +#define pcibios_scan_all_fns(a, b)     0
89005 +
89006 +extern unsigned long pci_mem_start;
89007 +#define PCIBIOS_MIN_IO         0x1000
89008 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
89009 +
89010 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
89011 +
89012 +void pcibios_config_init(void);
89013 +struct pci_bus * pcibios_scan_root(int bus);
89014 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
89015 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
89016 +
89017 +void pcibios_set_master(struct pci_dev *dev);
89018 +void pcibios_penalize_isa_irq(int irq, int active);
89019 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
89020 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
89021 +
89022 +#include <linux/types.h>
89023 +#include <linux/slab.h>
89024 +#include <asm/scatterlist.h>
89025 +#include <linux/string.h>
89026 +#include <asm/page.h>
89027 +
89028 +extern void pci_iommu_alloc(void);
89029 +extern int iommu_setup(char *opt);
89030 +
89031 +/* The PCI address space does equal the physical memory
89032 + * address space.  The networking and block device layers use
89033 + * this boolean for bounce buffer decisions
89034 + *
89035 + * On AMD64 it mostly equals, but we set it to zero if a hardware
89036 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
89037 + */
89038 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
89039 +
89040 +#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
89041 +
89042 +/*
89043 + * x86-64 always supports DAC, but sometimes it is useful to force
89044 + * devices through the IOMMU to get automatic sg list merging.
89045 + * Optional right now.
89046 + */
89047 +extern int iommu_sac_force;
89048 +#define pci_dac_dma_supported(pci_dev, mask)   (!iommu_sac_force)
89049 +
89050 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
89051 +       dma_addr_t ADDR_NAME;
89052 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
89053 +       __u32 LEN_NAME;
89054 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
89055 +       ((PTR)->ADDR_NAME)
89056 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
89057 +       (((PTR)->ADDR_NAME) = (VAL))
89058 +#define pci_unmap_len(PTR, LEN_NAME)                   \
89059 +       ((PTR)->LEN_NAME)
89060 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
89061 +       (((PTR)->LEN_NAME) = (VAL))
89062 +
89063 +#elif defined(CONFIG_SWIOTLB)
89064 +
89065 +#define pci_dac_dma_supported(pci_dev, mask)    1
89066 +
89067 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
89068 +       dma_addr_t ADDR_NAME;
89069 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
89070 +       __u32 LEN_NAME;
89071 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
89072 +       ((PTR)->ADDR_NAME)
89073 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
89074 +       (((PTR)->ADDR_NAME) = (VAL))
89075 +#define pci_unmap_len(PTR, LEN_NAME)                   \
89076 +       ((PTR)->LEN_NAME)
89077 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
89078 +       (((PTR)->LEN_NAME) = (VAL))
89079 +
89080 +#else
89081 +/* No IOMMU */
89082 +
89083 +#define pci_dac_dma_supported(pci_dev, mask)    1
89084 +
89085 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
89086 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
89087 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
89088 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
89089 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
89090 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
89091 +
89092 +#endif
89093 +
89094 +#include <asm-generic/pci-dma-compat.h>
89095 +
89096 +static inline dma64_addr_t
89097 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
89098 +{
89099 +       return ((dma64_addr_t) page_to_phys(page) +
89100 +               (dma64_addr_t) offset);
89101 +}
89102 +
89103 +static inline struct page *
89104 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
89105 +{
89106 +       return virt_to_page(__va(dma_addr));    
89107 +}
89108 +
89109 +static inline unsigned long
89110 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
89111 +{
89112 +       return (dma_addr & ~PAGE_MASK);
89113 +}
89114 +
89115 +static inline void
89116 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
89117 +{
89118 +}
89119 +
89120 +static inline void
89121 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
89122 +{
89123 +       flush_write_buffers();
89124 +}
89125 +
89126 +#ifdef CONFIG_PCI
89127 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
89128 +                                       enum pci_dma_burst_strategy *strat,
89129 +                                       unsigned long *strategy_parameter)
89130 +{
89131 +       *strat = PCI_DMA_BURST_INFINITY;
89132 +       *strategy_parameter = ~0UL;
89133 +}
89134 +#endif
89135 +
89136 +#define HAVE_PCI_MMAP
89137 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
89138 +                              enum pci_mmap_state mmap_state, int write_combine);
89139 +
89140 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
89141 +{
89142 +}
89143 +
89144 +#endif /* __KERNEL__ */
89145 +
89146 +/* generic pci stuff */
89147 +#ifdef CONFIG_PCI
89148 +#include <asm-generic/pci.h>
89149 +#endif
89150 +
89151 +/* On Xen we have to scan all functions since Xen hides bridges from
89152 + * us.  If a bridge is at fn=0 and that slot has a multifunction
89153 + * device, we won't find the additional devices without scanning all
89154 + * functions. */
89155 +#undef pcibios_scan_all_fns
89156 +#define pcibios_scan_all_fns(a, b)     1
89157 +
89158 +#endif /* __x8664_PCI_H */
89159 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/pgalloc.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/pgalloc.h
89160 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/pgalloc.h      1970-01-01 00:00:00.000000000 +0000
89161 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/pgalloc.h    2007-02-02 19:10:58.000000000 +0000
89162 @@ -0,0 +1,224 @@
89163 +#ifndef _X86_64_PGALLOC_H
89164 +#define _X86_64_PGALLOC_H
89165 +
89166 +#include <asm/fixmap.h>
89167 +#include <asm/pda.h>
89168 +#include <linux/threads.h>
89169 +#include <linux/mm.h>
89170 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
89171 +
89172 +#include <xen/features.h>
89173 +void make_page_readonly(void *va, unsigned int feature);
89174 +void make_page_writable(void *va, unsigned int feature);
89175 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
89176 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
89177 +
89178 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
89179 +
89180 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
89181 +{
89182 +       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
89183 +}
89184 +
89185 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
89186 +{
89187 +       if (unlikely((mm)->context.pinned)) {
89188 +               BUG_ON(HYPERVISOR_update_va_mapping(
89189 +                              (unsigned long)pmd,
89190 +                              pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, 
89191 +                                      PAGE_KERNEL_RO), 0));
89192 +               set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
89193 +       } else {
89194 +               *(pud) =  __pud(_PAGE_TABLE | __pa(pmd));
89195 +       }
89196 +}
89197 +
89198 +/*
89199 + * We need to use the batch mode here, but pgd_populate() won't
89200 + * be called frequently.
89201 + */
89202 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
89203 +{
89204 +       if (unlikely((mm)->context.pinned)) {
89205 +               BUG_ON(HYPERVISOR_update_va_mapping(
89206 +                              (unsigned long)pud,
89207 +                              pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, 
89208 +                                      PAGE_KERNEL_RO), 0));
89209 +               set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
89210 +               set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
89211 +       } else {
89212 +               *(pgd) =  __pgd(_PAGE_TABLE | __pa(pud));
89213 +               *(__user_pgd(pgd)) = *(pgd);
89214 +       }
89215 +}
89216 +
89217 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
89218 +{
89219 +       if (unlikely((mm)->context.pinned)) {
89220 +               BUG_ON(HYPERVISOR_update_va_mapping(
89221 +                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
89222 +                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
89223 +               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
89224 +       } else {
89225 +               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
89226 +       }
89227 +}
89228 +
89229 +static inline void pmd_free(pmd_t *pmd)
89230 +{
89231 +       pte_t *ptep = virt_to_ptep(pmd);
89232 +
89233 +       if (!pte_write(*ptep)) {
89234 +               BUG_ON(HYPERVISOR_update_va_mapping(
89235 +                       (unsigned long)pmd,
89236 +                       pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
89237 +                       0));
89238 +       }
89239 +       free_page((unsigned long)pmd);
89240 +}
89241 +
89242 +static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
89243 +{
89244 +       return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
89245 +}
89246 +
89247 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
89248 +{
89249 +       return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
89250 +}
89251 +
89252 +static inline void pud_free (pud_t *pud)
89253 +{
89254 +       pte_t *ptep = virt_to_ptep(pud);
89255 +
89256 +       if (!pte_write(*ptep)) {
89257 +               BUG_ON(HYPERVISOR_update_va_mapping(
89258 +                       (unsigned long)pud,
89259 +                       pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
89260 +                       0));
89261 +       }
89262 +       free_page((unsigned long)pud);
89263 +}
89264 +
89265 +static inline void pgd_list_add(pgd_t *pgd)
89266 +{
89267 +       struct page *page = virt_to_page(pgd);
89268 +
89269 +       spin_lock(&pgd_lock);
89270 +       page->index = (pgoff_t)pgd_list;
89271 +       if (pgd_list)
89272 +               pgd_list->private = (unsigned long)&page->index;
89273 +       pgd_list = page;
89274 +       page->private = (unsigned long)&pgd_list;
89275 +       spin_unlock(&pgd_lock);
89276 +}
89277 +
89278 +static inline void pgd_list_del(pgd_t *pgd)
89279 +{
89280 +       struct page *next, **pprev, *page = virt_to_page(pgd);
89281 +
89282 +       spin_lock(&pgd_lock);
89283 +       next = (struct page *)page->index;
89284 +       pprev = (struct page **)page->private;
89285 +       *pprev = next;
89286 +       if (next)
89287 +               next->private = (unsigned long)pprev;
89288 +       spin_unlock(&pgd_lock);
89289 +}
89290 +
89291 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
89292 +{
89293 +        /*
89294 +         * We allocate two contiguous pages for kernel and user.
89295 +         */
89296 +        unsigned boundary;
89297 +       pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
89298 +
89299 +       if (!pgd)
89300 +               return NULL;
89301 +       pgd_list_add(pgd);
89302 +       /*
89303 +        * Copy kernel pointers in from init.
89304 +        * Could keep a freelist or slab cache of those because the kernel
89305 +        * part never changes.
89306 +        */
89307 +       boundary = pgd_index(__PAGE_OFFSET);
89308 +       memset(pgd, 0, boundary * sizeof(pgd_t));
89309 +       memcpy(pgd + boundary,
89310 +              init_level4_pgt + boundary,
89311 +              (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
89312 +
89313 +       memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
89314 +        /*
89315 +         * Set level3_user_pgt for vsyscall area
89316 +         */
89317 +       set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START), 
89318 +                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
89319 +       return pgd;
89320 +}
89321 +
89322 +static inline void pgd_free(pgd_t *pgd)
89323 +{
89324 +       pte_t *ptep = virt_to_ptep(pgd);
89325 +
89326 +       if (!pte_write(*ptep)) {
89327 +               xen_pgd_unpin(__pa(pgd));
89328 +               BUG_ON(HYPERVISOR_update_va_mapping(
89329 +                              (unsigned long)pgd,
89330 +                              pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
89331 +                              0));
89332 +       }
89333 +
89334 +       ptep = virt_to_ptep(__user_pgd(pgd));
89335 +
89336 +       if (!pte_write(*ptep)) {
89337 +               xen_pgd_unpin(__pa(__user_pgd(pgd)));
89338 +               BUG_ON(HYPERVISOR_update_va_mapping(
89339 +                              (unsigned long)__user_pgd(pgd),
89340 +                              pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, 
89341 +                                      PAGE_KERNEL),
89342 +                              0));
89343 +       }
89344 +
89345 +       pgd_list_del(pgd);
89346 +       free_pages((unsigned long)pgd, 1);
89347 +}
89348 +
89349 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
89350 +{
89351 +        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
89352 +        if (pte)
89353 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
89354 +
89355 +       return pte;
89356 +}
89357 +
89358 +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
89359 +{
89360 +       struct page *pte;
89361 +
89362 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
89363 +       return pte;
89364 +}
89365 +
89366 +/* Should really implement gc for free page table pages. This could be
89367 +   done with a reference count in struct page. */
89368 +
89369 +static inline void pte_free_kernel(pte_t *pte)
89370 +{
89371 +       BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
89372 +        make_page_writable(pte, XENFEAT_writable_page_tables);
89373 +       free_page((unsigned long)pte); 
89374 +}
89375 +
89376 +extern void pte_free(struct page *pte);
89377 +
89378 +//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) 
89379 +//#define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
89380 +//#define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
89381 +
89382 +#define __pte_free_tlb(tlb,x)   pte_free((x))
89383 +#define __pmd_free_tlb(tlb,x)   pmd_free((x))
89384 +#define __pud_free_tlb(tlb,x)   pud_free((x))
89385 +
89386 +#endif /* _X86_64_PGALLOC_H */
89387 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/pgtable.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/pgtable.h
89388 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/pgtable.h      1970-01-01 00:00:00.000000000 +0000
89389 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/pgtable.h    2007-02-02 19:10:58.000000000 +0000
89390 @@ -0,0 +1,562 @@
89391 +#ifndef _X86_64_PGTABLE_H
89392 +#define _X86_64_PGTABLE_H
89393 +
89394 +/*
89395 + * This file contains the functions and defines necessary to modify and use
89396 + * the x86-64 page table tree.
89397 + */
89398 +#include <asm/processor.h>
89399 +#include <asm/fixmap.h>
89400 +#include <asm/bitops.h>
89401 +#include <linux/threads.h>
89402 +#include <linux/sched.h>
89403 +#include <asm/pda.h>
89404 +#ifdef CONFIG_XEN
89405 +#include <asm/hypervisor.h>
89406 +
89407 +extern pud_t level3_user_pgt[512];
89408 +extern pud_t init_level4_user_pgt[];
89409 +
89410 +extern void xen_init_pt(void);
89411 +
89412 +#define virt_to_ptep(__va)                                             \
89413 +({                                                                     \
89414 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
89415 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
89416 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
89417 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
89418 +})
89419 +
89420 +#define arbitrary_virt_to_machine(__va)                                        \
89421 +({                                                                     \
89422 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
89423 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
89424 +})
89425 +#endif
89426 +
89427 +extern pud_t level3_kernel_pgt[512];
89428 +extern pud_t level3_physmem_pgt[512];
89429 +extern pud_t level3_ident_pgt[512];
89430 +extern pmd_t level2_kernel_pgt[512];
89431 +extern pgd_t init_level4_pgt[];
89432 +extern pgd_t boot_level4_pgt[];
89433 +extern unsigned long __supported_pte_mask;
89434 +
89435 +#define swapper_pg_dir init_level4_pgt
89436 +
89437 +extern void paging_init(void);
89438 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
89439 +
89440 +/*
89441 + * ZERO_PAGE is a global shared page that is always zero: used
89442 + * for zero-mapped memory areas etc..
89443 + */
89444 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
89445 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
89446 +
89447 +/*
89448 + * PGDIR_SHIFT determines what a top-level page table entry can map
89449 + */
89450 +#define PGDIR_SHIFT    39
89451 +#define PTRS_PER_PGD   512
89452 +
89453 +/*
89454 + * 3rd level page
89455 + */
89456 +#define PUD_SHIFT      30
89457 +#define PTRS_PER_PUD   512
89458 +
89459 +/*
89460 + * PMD_SHIFT determines the size of the area a middle-level
89461 + * page table can map
89462 + */
89463 +#define PMD_SHIFT      21
89464 +#define PTRS_PER_PMD   512
89465 +
89466 +/*
89467 + * entries per page directory level
89468 + */
89469 +#define PTRS_PER_PTE   512
89470 +
89471 +#define pte_ERROR(e) \
89472 +       printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
89473 +#define pmd_ERROR(e) \
89474 +       printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
89475 +#define pud_ERROR(e) \
89476 +       printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
89477 +#define pgd_ERROR(e) \
89478 +       printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
89479 +
89480 +#define pgd_none(x)    (!pgd_val(x))
89481 +#define pud_none(x)    (!pud_val(x))
89482 +
89483 +#define set_pte_batched(pteptr, pteval) \
89484 +       queue_l1_entry_update(pteptr, (pteval))
89485 +
89486 +static inline void set_pte(pte_t *dst, pte_t val)
89487 +{
89488 +       *dst = val;
89489 +}
89490 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
89491 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
89492 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
89493 +               set_pte((ptep), (pteval));                              \
89494 +} while (0)
89495 +
89496 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
89497 +
89498 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
89499 +
89500 +static inline void pud_clear (pud_t *pud)
89501 +{
89502 +       set_pud(pud, __pud(0));
89503 +}
89504 +
89505 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
89506 +
89507 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
89508 +
89509 +static inline void pgd_clear (pgd_t * pgd)
89510 +{
89511 +        set_pgd(pgd, __pgd(0));
89512 +        set_pgd(__user_pgd(pgd), __pgd(0));
89513 +}
89514 +
89515 +/*
89516 + * A note on implementation of this atomic 'get-and-clear' operation.
89517 + * This is actually very simple because Xen Linux can only run on a single
89518 + * processor. Therefore, we cannot race other processors setting the 'accessed'
89519 + * or 'dirty' bits on a page-table entry.
89520 + * Even if pages are shared between domains, that is not a problem because
89521 + * each domain will have separate page tables, with their own versions of
89522 + * accessed & dirty state.
89523 + */
89524 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0))
89525 +
89526 +#if 0
89527 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
89528 +{
89529 +        pte_t pte = *xp;
89530 +        if (pte.pte)
89531 +                set_pte(xp, __pte_ma(0));
89532 +        return pte;
89533 +}
89534 +#endif
89535 +
89536 +struct mm_struct;
89537 +
89538 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
89539 +{
89540 +       pte_t pte;
89541 +       if (full) {
89542 +               pte = *ptep;
89543 +               *ptep = __pte(0);
89544 +       } else {
89545 +               pte = ptep_get_and_clear(mm, addr, ptep);
89546 +       }
89547 +       return pte;
89548 +}
89549 +
89550 +#define pte_same(a, b)         ((a).pte == (b).pte)
89551 +
89552 +#define pte_pgprot(a)  (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
89553 +
89554 +#define PMD_SIZE       (1UL << PMD_SHIFT)
89555 +#define PMD_MASK       (~(PMD_SIZE-1))
89556 +#define PUD_SIZE       (1UL << PUD_SHIFT)
89557 +#define PUD_MASK       (~(PUD_SIZE-1))
89558 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
89559 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
89560 +
89561 +#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
89562 +#define FIRST_USER_ADDRESS     0
89563 +
89564 +#ifndef __ASSEMBLY__
89565 +#define MAXMEM          0x3fffffffffffUL
89566 +#define VMALLOC_START    0xffffc20000000000UL
89567 +#define VMALLOC_END      0xffffe1ffffffffffUL
89568 +#define MODULES_VADDR    0xffffffff88000000UL
89569 +#define MODULES_END      0xfffffffffff00000UL
89570 +#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
89571 +
89572 +#define _PAGE_BIT_PRESENT      0
89573 +#define _PAGE_BIT_RW           1
89574 +#define _PAGE_BIT_USER         2
89575 +#define _PAGE_BIT_PWT          3
89576 +#define _PAGE_BIT_PCD          4
89577 +#define _PAGE_BIT_ACCESSED     5
89578 +#define _PAGE_BIT_DIRTY                6
89579 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
89580 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
89581 +#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
89582 +
89583 +#define _PAGE_PRESENT  0x001
89584 +#define _PAGE_RW       0x002
89585 +#define _PAGE_USER     0x004
89586 +#define _PAGE_PWT      0x008
89587 +#define _PAGE_PCD      0x010
89588 +#define _PAGE_ACCESSED 0x020
89589 +#define _PAGE_DIRTY    0x040
89590 +#define _PAGE_PSE      0x080   /* 2MB page */
89591 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
89592 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry */
89593 +
89594 +#define _PAGE_PROTNONE 0x080   /* If not present */
89595 +#define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
89596 +
89597 +#ifdef CONFIG_XEN_COMPAT_030002
89598 +extern unsigned int __kernel_page_user;
89599 +#else
89600 +#define __kernel_page_user 0
89601 +#endif
89602 +
89603 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
89604 +#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
89605 +
89606 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
89607 +
89608 +#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
89609 +#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
89610 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
89611 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
89612 +#define PAGE_COPY PAGE_COPY_NOEXEC
89613 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
89614 +#define PAGE_READONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
89615 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
89616 +#define __PAGE_KERNEL \
89617 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
89618 +#define __PAGE_KERNEL_EXEC \
89619 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
89620 +#define __PAGE_KERNEL_NOCACHE \
89621 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
89622 +#define __PAGE_KERNEL_RO \
89623 +       (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
89624 +#define __PAGE_KERNEL_VSYSCALL \
89625 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
89626 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
89627 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
89628 +#define __PAGE_KERNEL_LARGE \
89629 +       (__PAGE_KERNEL | _PAGE_PSE)
89630 +#define __PAGE_KERNEL_LARGE_EXEC \
89631 +       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
89632 +
89633 +/*
89634 + * We don't support GLOBAL page in xenolinux64
89635 + */
89636 +#define MAKE_GLOBAL(x) __pgprot((x))
89637 +
89638 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
89639 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
89640 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
89641 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
89642 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
89643 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
89644 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
89645 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
89646 +
89647 +/*         xwr */
89648 +#define __P000 PAGE_NONE
89649 +#define __P001 PAGE_READONLY
89650 +#define __P010 PAGE_COPY
89651 +#define __P011 PAGE_COPY
89652 +#define __P100 PAGE_READONLY_EXEC
89653 +#define __P101 PAGE_READONLY_EXEC
89654 +#define __P110 PAGE_COPY_EXEC
89655 +#define __P111 PAGE_COPY_EXEC
89656 +
89657 +#define __S000 PAGE_NONE
89658 +#define __S001 PAGE_READONLY
89659 +#define __S010 PAGE_SHARED
89660 +#define __S011 PAGE_SHARED
89661 +#define __S100 PAGE_READONLY_EXEC
89662 +#define __S101 PAGE_READONLY_EXEC
89663 +#define __S110 PAGE_SHARED_EXEC
89664 +#define __S111 PAGE_SHARED_EXEC
89665 +
89666 +static inline unsigned long pgd_bad(pgd_t pgd) 
89667 +{ 
89668 +       unsigned long val = pgd_val(pgd);
89669 +       val &= ~PTE_MASK; 
89670 +       val &= ~(_PAGE_USER | _PAGE_DIRTY); 
89671 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);      
89672 +} 
89673 +
89674 +static inline unsigned long pud_bad(pud_t pud)
89675 +{
89676 +       unsigned long val = pud_val(pud);
89677 +       val &= ~PTE_MASK;
89678 +       val &= ~(_PAGE_USER | _PAGE_DIRTY);
89679 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
89680 +}
89681 +
89682 +#define pte_none(x)    (!(x).pte)
89683 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
89684 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
89685 +
89686 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))        /* FIXME: is this
89687 +                                                  right? */
89688 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
89689 +#define pte_pfn(x) mfn_to_local_pfn(pte_mfn(x))
89690 +#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
89691 +
89692 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
89693 +{
89694 +       pte_t pte;
89695 +       (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT);
89696 +       (pte).pte |= pgprot_val(pgprot);
89697 +       (pte).pte &= __supported_pte_mask;
89698 +       return pte;
89699 +}
89700 +
89701 +/*
89702 + * The following only work if pte_present() is true.
89703 + * Undefined behaviour if not..
89704 + */
89705 +#define __pte_val(x)   ((x).pte)
89706 +
89707 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
89708 +static inline int pte_user(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
89709 +static inline int pte_read(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
89710 +static inline int pte_exec(pte_t pte)          { return !(__pte_val(pte) & _PAGE_NX); }
89711 +static inline int pte_dirty(pte_t pte)         { return __pte_val(pte) & _PAGE_DIRTY; }
89712 +static inline int pte_young(pte_t pte)         { return __pte_val(pte) & _PAGE_ACCESSED; }
89713 +static inline int pte_write(pte_t pte)         { return __pte_val(pte) & _PAGE_RW; }
89714 +static inline int pte_file(pte_t pte)          { return __pte_val(pte) & _PAGE_FILE; }
89715 +static inline int pte_huge(pte_t pte)          { return __pte_val(pte) & _PAGE_PSE; }
89716 +
89717 +static inline pte_t pte_rdprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
89718 +static inline pte_t pte_exprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
89719 +static inline pte_t pte_mkclean(pte_t pte)     { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
89720 +static inline pte_t pte_mkold(pte_t pte)       { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
89721 +static inline pte_t pte_wrprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_RW; return pte; }
89722 +static inline pte_t pte_mkread(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
89723 +static inline pte_t pte_mkexec(pte_t pte)      { __pte_val(pte) &= ~_PAGE_NX; return pte; }
89724 +static inline pte_t pte_mkdirty(pte_t pte)     { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
89725 +static inline pte_t pte_mkyoung(pte_t pte)     { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
89726 +static inline pte_t pte_mkwrite(pte_t pte)     { __pte_val(pte) |= _PAGE_RW; return pte; }
89727 +static inline pte_t pte_mkhuge(pte_t pte)      { __pte_val(pte) |= _PAGE_PSE; return pte; }
89728 +static inline pte_t pte_clrhuge(pte_t pte)     { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
89729 +
89730 +struct vm_area_struct;
89731 +
89732 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
89733 +{
89734 +       pte_t pte = *ptep;
89735 +       int ret = pte_dirty(pte);
89736 +       if (ret)
89737 +               set_pte(ptep, pte_mkclean(pte));
89738 +       return ret;
89739 +}
89740 +
89741 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
89742 +{
89743 +       pte_t pte = *ptep;
89744 +       int ret = pte_young(pte);
89745 +       if (ret)
89746 +               set_pte(ptep, pte_mkold(pte));
89747 +       return ret;
89748 +}
89749 +
89750 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
89751 +{
89752 +       pte_t pte = *ptep;
89753 +       if (pte_write(pte))
89754 +               set_pte(ptep, pte_wrprotect(pte));
89755 +}
89756 +
89757 +/*
89758 + * Macro to mark a page protection value as "uncacheable".
89759 + */
89760 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
89761 +
89762 +static inline int pmd_large(pmd_t pte) { 
89763 +       return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; 
89764 +}      
89765 +
89766 +
89767 +/*
89768 + * Conversion functions: convert a page and protection to a page entry,
89769 + * and a page entry and page directory to the page they refer to.
89770 + */
89771 +
89772 +/*
89773 + * Level 4 access.
89774 + * Never use these in the common code.
89775 + */
89776 +#define pgd_page_vaddr(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
89777 +#define pgd_page(pgd)          (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
89778 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
89779 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
89780 +#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address))
89781 +#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
89782 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
89783 +
89784 +/* PUD - Level3 access */
89785 +/* to find an entry in a page-table-directory. */
89786 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
89787 +#define pud_page(pud)          (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
89788 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
89789 +#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
89790 +extern inline int pud_present(pud_t pud)       { return !pud_none(pud); }
89791 +
89792 +/* Find correct pud via the hidden fourth level page level: */
89793 +
89794 +/* This accesses the reference page table of the boot cpu. 
89795 +   Other CPUs get synced lazily via the page fault handler. */
89796 +static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address)
89797 +{
89798 +       return pud_offset(pgd_offset_k(address), address);
89799 +}
89800 +
89801 +/* PMD  - Level 2 access */
89802 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
89803 +#define pmd_page(pmd)          (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
89804 +
89805 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
89806 +#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
89807 +                        pmd_index(address))
89808 +#define pmd_none(x)    (!pmd_val(x))
89809 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
89810 +   can temporarily clear it. */
89811 +#define pmd_present(x) (pmd_val(x))
89812 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
89813 +#define pmd_bad(x) ((pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
89814 +                   != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
89815 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
89816 +#define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
89817 +
89818 +#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
89819 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
89820 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
89821 +
89822 +/* PTE - Level 1 access. */
89823 +
89824 +/* page, protection -> pte */
89825 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
89826 +#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
89827 +
89828 +/* physical address -> PTE */
89829 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
89830 +{ 
89831 +       pte_t pte;
89832 +       pte.pte = physpage | pgprot_val(pgprot); 
89833 +       pte.pte &= __supported_pte_mask;
89834 +       return pte; 
89835 +}
89836 +
89837 +/* Change flags of a PTE */
89838 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
89839 +{ 
89840 +        (pte).pte &= _PAGE_CHG_MASK;
89841 +       (pte).pte |= pgprot_val(newprot);
89842 +       (pte).pte &= __supported_pte_mask;
89843 +       return pte; 
89844 +}
89845 +
89846 +#define pte_index(address) \
89847 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
89848 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
89849 +                       pte_index(address))
89850 +
89851 +/* x86-64 always has all page tables mapped. */
89852 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
89853 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
89854 +#define pte_unmap(pte) /* NOP */
89855 +#define pte_unmap_nested(pte) /* NOP */ 
89856 +
89857 +#define update_mmu_cache(vma,address,pte) do { } while (0)
89858 +
89859 +/* We only update the dirty/accessed state if we set
89860 + * the dirty bit by hand in the kernel, since the hardware
89861 + * will do the accessed bit for us, and we don't want to
89862 + * race with other CPUs that might be updating the dirty
89863 + * bit at the same time. */
89864 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
89865 +#if 0
89866 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
89867 +       do {                                                              \
89868 +               if (__dirty) {                                            \
89869 +                       set_pte(__ptep, __entry);                         \
89870 +                       flush_tlb_page(__vma, __address);                 \
89871 +               }                                                         \
89872 +       } while (0)
89873 +#endif
89874 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
89875 +       do {                                                              \
89876 +               if (__dirty) {                                            \
89877 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
89878 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
89879 +                       } else {                                          \
89880 +                            xen_l1_entry_update((__ptep), (__entry)); \
89881 +                           flush_tlb_page((__vma), (__address));         \
89882 +                       }                                                 \
89883 +               }                                                         \
89884 +       } while (0)
89885 +
89886 +/* Encode and de-code a swap entry */
89887 +#define __swp_type(x)                  (((x).val >> 1) & 0x3f)
89888 +#define __swp_offset(x)                        ((x).val >> 8)
89889 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
89890 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val(pte) })
89891 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
89892 +
89893 +extern spinlock_t pgd_lock;
89894 +extern struct page *pgd_list;
89895 +void vmalloc_sync_all(void);
89896 +
89897 +#endif /* !__ASSEMBLY__ */
89898 +
89899 +extern int kern_addr_valid(unsigned long addr); 
89900 +
89901 +#define DOMID_LOCAL (0xFFFFU)
89902 +
89903 +int direct_remap_pfn_range(struct vm_area_struct *vma,
89904 +                            unsigned long address,
89905 +                            unsigned long mfn,
89906 +                            unsigned long size,
89907 +                            pgprot_t prot,
89908 +                            domid_t  domid);
89909 +
89910 +int direct_kernel_remap_pfn_range(unsigned long address, 
89911 +                                 unsigned long mfn,
89912 +                                 unsigned long size, 
89913 +                                 pgprot_t prot,
89914 +                                 domid_t  domid);
89915 +
89916 +int create_lookup_pte_addr(struct mm_struct *mm,
89917 +                           unsigned long address,
89918 +                           uint64_t *ptep);
89919 +
89920 +int touch_pte_range(struct mm_struct *mm,
89921 +                    unsigned long address,
89922 +                    unsigned long size);
89923 +
89924 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
89925 +               direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
89926 +
89927 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
89928 +#define GET_IOSPACE(pfn)               0
89929 +#define GET_PFN(pfn)                   (pfn)
89930 +
89931 +#define HAVE_ARCH_UNMAPPED_AREA
89932 +
89933 +#define pgtable_cache_init()   do { } while (0)
89934 +#define check_pgt_cache()      do { } while (0)
89935 +
89936 +#define PAGE_AGP    PAGE_KERNEL_NOCACHE
89937 +#define HAVE_PAGE_AGP 1
89938 +
89939 +/* fs/proc/kcore.c */
89940 +#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
89941 +#define        kc_offset_to_vaddr(o) \
89942 +   (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
89943 +
89944 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
89945 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
89946 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
89947 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
89948 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
89949 +#define __HAVE_ARCH_PTE_SAME
89950 +#include <asm-generic/pgtable.h>
89951 +
89952 +#endif /* _X86_64_PGTABLE_H */
89953 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/processor.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/processor.h
89954 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/processor.h    1970-01-01 00:00:00.000000000 +0000
89955 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/processor.h  2007-02-02 19:10:58.000000000 +0000
89956 @@ -0,0 +1,499 @@
89957 +/*
89958 + * include/asm-x86_64/processor.h
89959 + *
89960 + * Copyright (C) 1994 Linus Torvalds
89961 + */
89962 +
89963 +#ifndef __ASM_X86_64_PROCESSOR_H
89964 +#define __ASM_X86_64_PROCESSOR_H
89965 +
89966 +#include <asm/segment.h>
89967 +#include <asm/page.h>
89968 +#include <asm/types.h>
89969 +#include <asm/sigcontext.h>
89970 +#include <asm/cpufeature.h>
89971 +#include <linux/threads.h>
89972 +#include <asm/msr.h>
89973 +#include <asm/current.h>
89974 +#include <asm/system.h>
89975 +#include <asm/mmsegment.h>
89976 +#include <asm/percpu.h>
89977 +#include <linux/personality.h>
89978 +#include <linux/cpumask.h>
89979 +
89980 +#define TF_MASK                0x00000100
89981 +#define IF_MASK                0x00000200
89982 +#define IOPL_MASK      0x00003000
89983 +#define NT_MASK                0x00004000
89984 +#define VM_MASK                0x00020000
89985 +#define AC_MASK                0x00040000
89986 +#define VIF_MASK       0x00080000      /* virtual interrupt flag */
89987 +#define VIP_MASK       0x00100000      /* virtual interrupt pending */
89988 +#define ID_MASK                0x00200000
89989 +
89990 +#define desc_empty(desc) \
89991 +               (!((desc)->a | (desc)->b))
89992 +
89993 +#define desc_equal(desc1, desc2) \
89994 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
89995 +
89996 +/*
89997 + * Default implementation of macro that returns current
89998 + * instruction pointer ("program counter").
89999 + */
90000 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
90001 +
90002 +/*
90003 + *  CPU type and hardware bug flags. Kept separately for each CPU.
90004 + */
90005 +
90006 +struct cpuinfo_x86 {
90007 +       __u8    x86;            /* CPU family */
90008 +       __u8    x86_vendor;     /* CPU vendor */
90009 +       __u8    x86_model;
90010 +       __u8    x86_mask;
90011 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
90012 +       __u32   x86_capability[NCAPINTS];
90013 +       char    x86_vendor_id[16];
90014 +       char    x86_model_id[64];
90015 +       int     x86_cache_size;  /* in KB */
90016 +       int     x86_clflush_size;
90017 +       int     x86_cache_alignment;
90018 +       int     x86_tlbsize;    /* number of 4K pages in DTLB/ITLB combined(in pages)*/
90019 +        __u8    x86_virt_bits, x86_phys_bits;
90020 +       __u8    x86_max_cores;  /* cpuid returned max cores value */
90021 +        __u32   x86_power;     
90022 +       __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
90023 +       unsigned long loops_per_jiffy;
90024 +#ifdef CONFIG_SMP
90025 +       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
90026 +#endif
90027 +       __u8    apicid;
90028 +#ifdef CONFIG_SMP
90029 +       __u8    booted_cores;   /* number of cores as seen by OS */
90030 +       __u8    phys_proc_id;   /* Physical Processor id. */
90031 +       __u8    cpu_core_id;    /* Core id. */
90032 +#endif
90033 +} ____cacheline_aligned;
90034 +
90035 +#define X86_VENDOR_INTEL 0
90036 +#define X86_VENDOR_CYRIX 1
90037 +#define X86_VENDOR_AMD 2
90038 +#define X86_VENDOR_UMC 3
90039 +#define X86_VENDOR_NEXGEN 4
90040 +#define X86_VENDOR_CENTAUR 5
90041 +#define X86_VENDOR_RISE 6
90042 +#define X86_VENDOR_TRANSMETA 7
90043 +#define X86_VENDOR_NUM 8
90044 +#define X86_VENDOR_UNKNOWN 0xff
90045 +
90046 +#ifdef CONFIG_SMP
90047 +extern struct cpuinfo_x86 cpu_data[];
90048 +#define current_cpu_data cpu_data[smp_processor_id()]
90049 +#else
90050 +#define cpu_data (&boot_cpu_data)
90051 +#define current_cpu_data boot_cpu_data
90052 +#endif
90053 +
90054 +extern char ignore_irq13;
90055 +
90056 +extern void identify_cpu(struct cpuinfo_x86 *);
90057 +extern void print_cpu_info(struct cpuinfo_x86 *);
90058 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
90059 +extern unsigned short num_cache_leaves;
90060 +
90061 +/*
90062 + * EFLAGS bits
90063 + */
90064 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
90065 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
90066 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
90067 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
90068 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
90069 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
90070 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
90071 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
90072 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
90073 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
90074 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
90075 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
90076 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
90077 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
90078 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
90079 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
90080 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
90081 +
90082 +/*
90083 + * Intel CPU features in CR4
90084 + */
90085 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
90086 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
90087 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
90088 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
90089 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
90090 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
90091 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
90092 +#define X86_CR4_PGE            0x0080  /* enable global pages */
90093 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
90094 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
90095 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
90096 +
90097 +/*
90098 + * Save the cr4 feature set we're using (ie
90099 + * Pentium 4MB enable and PPro Global page
90100 + * enable), so that any CPUs that boot up
90101 + * after us can get the correct flags.
90102 + */
90103 +extern unsigned long mmu_cr4_features;
90104 +
90105 +static inline void set_in_cr4 (unsigned long mask)
90106 +{
90107 +       mmu_cr4_features |= mask;
90108 +       __asm__("movq %%cr4,%%rax\n\t"
90109 +               "orq %0,%%rax\n\t"
90110 +               "movq %%rax,%%cr4\n"
90111 +               : : "irg" (mask)
90112 +               :"ax");
90113 +}
90114 +
90115 +static inline void clear_in_cr4 (unsigned long mask)
90116 +{
90117 +       mmu_cr4_features &= ~mask;
90118 +       __asm__("movq %%cr4,%%rax\n\t"
90119 +               "andq %0,%%rax\n\t"
90120 +               "movq %%rax,%%cr4\n"
90121 +               : : "irg" (~mask)
90122 +               :"ax");
90123 +}
90124 +
90125 +
90126 +/*
90127 + * User space process size. 47bits minus one guard page.
90128 + */
90129 +#define TASK_SIZE64    (0x800000000000UL - 4096)
90130 +
90131 +/* This decides where the kernel will search for a free chunk of vm
90132 + * space during mmap's.
90133 + */
90134 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
90135 +
90136 +#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
90137 +#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
90138 +
90139 +#define TASK_UNMAPPED_BASE     PAGE_ALIGN(TASK_SIZE/3)
90140 +
90141 +/*
90142 + * Size of io_bitmap.
90143 + */
90144 +#define IO_BITMAP_BITS  65536
90145 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
90146 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
90147 +#ifndef CONFIG_X86_NO_TSS
90148 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
90149 +#endif
90150 +#define INVALID_IO_BITMAP_OFFSET 0x8000
90151 +
90152 +struct i387_fxsave_struct {
90153 +       u16     cwd;
90154 +       u16     swd;
90155 +       u16     twd;
90156 +       u16     fop;
90157 +       u64     rip;
90158 +       u64     rdp; 
90159 +       u32     mxcsr;
90160 +       u32     mxcsr_mask;
90161 +       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
90162 +       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 128 bytes */
90163 +       u32     padding[24];
90164 +} __attribute__ ((aligned (16)));
90165 +
90166 +union i387_union {
90167 +       struct i387_fxsave_struct       fxsave;
90168 +};
90169 +
90170 +#ifndef CONFIG_X86_NO_TSS
90171 +struct tss_struct {
90172 +       u32 reserved1;
90173 +       u64 rsp0;       
90174 +       u64 rsp1;
90175 +       u64 rsp2;
90176 +       u64 reserved2;
90177 +       u64 ist[7];
90178 +       u32 reserved3;
90179 +       u32 reserved4;
90180 +       u16 reserved5;
90181 +       u16 io_bitmap_base;
90182 +       /*
90183 +        * The extra 1 is there because the CPU will access an
90184 +        * additional byte beyond the end of the IO permission
90185 +        * bitmap. The extra byte must be all 1 bits, and must
90186 +        * be within the limit. Thus we have:
90187 +        *
90188 +        * 128 bytes, the bitmap itself, for ports 0..0x3ff
90189 +        * 8 bytes, for an extra "long" of ~0UL
90190 +        */
90191 +       unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
90192 +} __attribute__((packed)) ____cacheline_aligned;
90193 +
90194 +DECLARE_PER_CPU(struct tss_struct,init_tss);
90195 +/* Save the original ist values for checking stack pointers during debugging */
90196 +#endif
90197 +
90198 +extern struct cpuinfo_x86 boot_cpu_data;
90199 +struct orig_ist {
90200 +       unsigned long ist[7];
90201 +};
90202 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
90203 +
90204 +#ifdef CONFIG_X86_VSMP
90205 +#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
90206 +#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
90207 +#else
90208 +#define ARCH_MIN_TASKALIGN     16
90209 +#define ARCH_MIN_MMSTRUCT_ALIGN        0
90210 +#endif
90211 +
90212 +struct thread_struct {
90213 +       unsigned long   rsp0;
90214 +       unsigned long   rsp;
90215 +       unsigned long   userrsp;        /* Copy from PDA */ 
90216 +       unsigned long   fs;
90217 +       unsigned long   gs;
90218 +       unsigned short  es, ds, fsindex, gsindex;       
90219 +/* Hardware debugging registers */
90220 +       unsigned long   debugreg0;  
90221 +       unsigned long   debugreg1;  
90222 +       unsigned long   debugreg2;  
90223 +       unsigned long   debugreg3;  
90224 +       unsigned long   debugreg6;  
90225 +       unsigned long   debugreg7;  
90226 +/* fault info */
90227 +       unsigned long   cr2, trap_no, error_code;
90228 +/* floating point info */
90229 +       union i387_union        i387  __attribute__((aligned(16)));
90230 +/* IO permissions. the bitmap could be moved into the GDT, that would make
90231 +   switch faster for a limited number of ioperm using tasks. -AK */
90232 +       int             ioperm;
90233 +       unsigned long   *io_bitmap_ptr;
90234 +       unsigned io_bitmap_max;
90235 +/* cached TLS descriptors. */
90236 +       u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
90237 +       unsigned int    iopl;
90238 +} __attribute__((aligned(16)));
90239 +
90240 +#define INIT_THREAD  { \
90241 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
90242 +}
90243 +
90244 +#ifndef CONFIG_X86_NO_TSS
90245 +#define INIT_TSS  { \
90246 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
90247 +}
90248 +#endif
90249 +
90250 +#define INIT_MMAP \
90251 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
90252 +
90253 +#define start_thread(regs,new_rip,new_rsp) do { \
90254 +       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));      \
90255 +       load_gs_index(0);                                                       \
90256 +       (regs)->rip = (new_rip);                                                 \
90257 +       (regs)->rsp = (new_rsp);                                                 \
90258 +       write_pda(oldrsp, (new_rsp));                                            \
90259 +       (regs)->cs = __USER_CS;                                                  \
90260 +       (regs)->ss = __USER_DS;                                                  \
90261 +       (regs)->eflags = 0x200;                                                  \
90262 +       set_fs(USER_DS);                                                         \
90263 +} while(0) 
90264 +
90265 +#define get_debugreg(var, register)                            \
90266 +       var = HYPERVISOR_get_debugreg(register)
90267 +#define set_debugreg(value, register)                  \
90268 +       HYPERVISOR_set_debugreg(register, value)
90269 +
90270 +struct task_struct;
90271 +struct mm_struct;
90272 +
90273 +/* Free all resources held by a thread. */
90274 +extern void release_thread(struct task_struct *);
90275 +
90276 +/* Prepare to copy thread state - unlazy all lazy status */
90277 +extern void prepare_to_copy(struct task_struct *tsk);
90278 +
90279 +/*
90280 + * create a kernel thread without removing it from tasklists
90281 + */
90282 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
90283 +
90284 +/*
90285 + * Return saved PC of a blocked thread.
90286 + * What is this good for? it will be always the scheduler or ret_from_fork.
90287 + */
90288 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
90289 +
90290 +extern unsigned long get_wchan(struct task_struct *p);
90291 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
90292 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
90293 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
90294 +
90295 +
90296 +struct microcode_header {
90297 +       unsigned int hdrver;
90298 +       unsigned int rev;
90299 +       unsigned int date;
90300 +       unsigned int sig;
90301 +       unsigned int cksum;
90302 +       unsigned int ldrver;
90303 +       unsigned int pf;
90304 +       unsigned int datasize;
90305 +       unsigned int totalsize;
90306 +       unsigned int reserved[3];
90307 +};
90308 +
90309 +struct microcode {
90310 +       struct microcode_header hdr;
90311 +       unsigned int bits[0];
90312 +};
90313 +
90314 +typedef struct microcode microcode_t;
90315 +typedef struct microcode_header microcode_header_t;
90316 +
90317 +/* microcode format is extended from prescott processors */
90318 +struct extended_signature {
90319 +       unsigned int sig;
90320 +       unsigned int pf;
90321 +       unsigned int cksum;
90322 +};
90323 +
90324 +struct extended_sigtable {
90325 +       unsigned int count;
90326 +       unsigned int cksum;
90327 +       unsigned int reserved[3];
90328 +       struct extended_signature sigs[0];
90329 +};
90330 +
90331 +
90332 +#define ASM_NOP1 K8_NOP1
90333 +#define ASM_NOP2 K8_NOP2
90334 +#define ASM_NOP3 K8_NOP3
90335 +#define ASM_NOP4 K8_NOP4
90336 +#define ASM_NOP5 K8_NOP5
90337 +#define ASM_NOP6 K8_NOP6
90338 +#define ASM_NOP7 K8_NOP7
90339 +#define ASM_NOP8 K8_NOP8
90340 +
90341 +/* Opteron nops */
90342 +#define K8_NOP1 ".byte 0x90\n"
90343 +#define K8_NOP2        ".byte 0x66,0x90\n" 
90344 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
90345 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
90346 +#define K8_NOP5        K8_NOP3 K8_NOP2 
90347 +#define K8_NOP6        K8_NOP3 K8_NOP3
90348 +#define K8_NOP7        K8_NOP4 K8_NOP3
90349 +#define K8_NOP8        K8_NOP4 K8_NOP4
90350 +
90351 +#define ASM_NOP_MAX 8
90352 +
90353 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
90354 +static inline void rep_nop(void)
90355 +{
90356 +       __asm__ __volatile__("rep;nop": : :"memory");
90357 +}
90358 +
90359 +/* Stop speculative execution */
90360 +static inline void sync_core(void)
90361 +{ 
90362 +       int tmp;
90363 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
90364 +} 
90365 +
90366 +#define cpu_has_fpu 1
90367 +
90368 +#define ARCH_HAS_PREFETCH
90369 +static inline void prefetch(void *x) 
90370 +{ 
90371 +       asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
90372 +} 
90373 +
90374 +#define ARCH_HAS_PREFETCHW 1
90375 +static inline void prefetchw(void *x) 
90376 +{ 
90377 +       alternative_input("prefetcht0 (%1)",
90378 +                         "prefetchw (%1)",
90379 +                         X86_FEATURE_3DNOW,
90380 +                         "r" (x));
90381 +} 
90382 +
90383 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
90384 +
90385 +#define spin_lock_prefetch(x)  prefetchw(x)
90386 +
90387 +#define cpu_relax()   rep_nop()
90388 +
90389 +/*
90390 + *      NSC/Cyrix CPU configuration register indexes
90391 + */
90392 +#define CX86_CCR0 0xc0
90393 +#define CX86_CCR1 0xc1
90394 +#define CX86_CCR2 0xc2
90395 +#define CX86_CCR3 0xc3
90396 +#define CX86_CCR4 0xe8
90397 +#define CX86_CCR5 0xe9
90398 +#define CX86_CCR6 0xea
90399 +#define CX86_CCR7 0xeb
90400 +#define CX86_DIR0 0xfe
90401 +#define CX86_DIR1 0xff
90402 +#define CX86_ARR_BASE 0xc4
90403 +#define CX86_RCR_BASE 0xdc
90404 +
90405 +/*
90406 + *      NSC/Cyrix CPU indexed register access macros
90407 + */
90408 +
90409 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
90410 +
90411 +#define setCx86(reg, data) do { \
90412 +       outb((reg), 0x22); \
90413 +       outb((data), 0x23); \
90414 +} while (0)
90415 +
90416 +static inline void serialize_cpu(void)
90417 +{
90418 +       __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
90419 +}
90420 +
90421 +static inline void __monitor(const void *eax, unsigned long ecx,
90422 +               unsigned long edx)
90423 +{
90424 +       /* "monitor %eax,%ecx,%edx;" */
90425 +       asm volatile(
90426 +               ".byte 0x0f,0x01,0xc8;"
90427 +               : :"a" (eax), "c" (ecx), "d"(edx));
90428 +}
90429 +
90430 +static inline void __mwait(unsigned long eax, unsigned long ecx)
90431 +{
90432 +       /* "mwait %eax,%ecx;" */
90433 +       asm volatile(
90434 +               ".byte 0x0f,0x01,0xc9;"
90435 +               : :"a" (eax), "c" (ecx));
90436 +}
90437 +
90438 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
90439 +
90440 +#define stack_current() \
90441 +({                                                             \
90442 +       struct thread_info *ti;                                 \
90443 +       asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));  \
90444 +       ti->task;                                       \
90445 +})
90446 +
90447 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
90448 +
90449 +extern unsigned long boot_option_idle_override;
90450 +/* Boot loader type from the setup header */
90451 +extern int bootloader_type;
90452 +
90453 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
90454 +
90455 +#endif /* __ASM_X86_64_PROCESSOR_H */
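The rep_nop()/cpu_relax() pair defined in this header is meant for busy-wait loops, as the comment above it notes. A minimal sketch of that usage pattern (illustrative only, not part of the patch; the 'ready' flag is hypothetical):

/* Spin until another CPU sets 'ready', inserting PAUSE (rep;nop) on each
 * iteration so the waiting core yields pipeline resources and, on SMT
 * parts, its sibling thread. */
static volatile int ready;

static void wait_for_ready(void)
{
        while (!ready)
                cpu_relax();            /* expands to rep_nop() above */
}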
90456 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/smp.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/smp.h
90457 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/smp.h  1970-01-01 00:00:00.000000000 +0000
90458 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/smp.h        2007-02-02 19:10:58.000000000 +0000
90459 @@ -0,0 +1,132 @@
90460 +#ifndef __ASM_SMP_H
90461 +#define __ASM_SMP_H
90462 +
90463 +/*
90464 + * We need the APIC definitions automatically as part of 'smp.h'
90465 + */
90466 +#include <linux/threads.h>
90467 +#include <linux/cpumask.h>
90468 +#include <linux/bitops.h>
90469 +extern int disable_apic;
90470 +
90471 +#include <asm/fixmap.h>
90472 +#include <asm/mpspec.h>
90473 +#include <asm/io_apic.h>
90474 +#include <asm/apic.h>
90475 +#include <asm/thread_info.h>
90476 +
90477 +#ifdef CONFIG_SMP
90478 +
90479 +#include <asm/pda.h>
90480 +
90481 +struct pt_regs;
90482 +
90483 +extern cpumask_t cpu_present_mask;
90484 +extern cpumask_t cpu_possible_map;
90485 +extern cpumask_t cpu_online_map;
90486 +extern cpumask_t cpu_initialized;
90487 +
90488 +/*
90489 + * Private routines/data
90490 + */
90491 +
90492 +extern void smp_alloc_memory(void);
90493 +extern volatile unsigned long smp_invalidate_needed;
90494 +extern void lock_ipi_call_lock(void);
90495 +extern void unlock_ipi_call_lock(void);
90496 +extern int smp_num_siblings;
90497 +extern void smp_send_reschedule(int cpu);
90498 +void smp_stop_cpu(void);
90499 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
90500 +                               void *info, int retry, int wait);
90501 +
90502 +extern cpumask_t cpu_sibling_map[NR_CPUS];
90503 +extern cpumask_t cpu_core_map[NR_CPUS];
90504 +extern u8 cpu_llc_id[NR_CPUS];
90505 +
90506 +#define SMP_TRAMPOLINE_BASE 0x6000
90507 +
90508 +/*
90509 + * On x86 all CPUs are mapped 1:1 to the APIC space.
90510 + * This simplifies scheduling and IPI sending and
90511 + * compresses data structures.
90512 + */
90513 +
90514 +static inline int num_booting_cpus(void)
90515 +{
90516 +       return cpus_weight(cpu_possible_map);
90517 +}
90518 +
90519 +#define raw_smp_processor_id() read_pda(cpunumber)
90520 +
90521 +static inline int hard_smp_processor_id(void)
90522 +{
90523 +       /* we don't want to mark this access volatile - bad code generation */
90524 +       return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
90525 +}
90526 +
90527 +extern int __cpu_disable(void);
90528 +extern void __cpu_die(unsigned int cpu);
90529 +extern void prefill_possible_map(void);
90530 +extern unsigned num_processors;
90531 +extern unsigned disabled_cpus;
90532 +
90533 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
90534 +
90535 +#endif
90536 +
90537 +/*
90538 + * Some lowlevel functions might want to know about
90539 + * the real APIC ID <-> CPU # mapping.
90540 + */
90541 +extern u8 x86_cpu_to_apicid[NR_CPUS];  /* physical ID */
90542 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
90543 +extern u8 bios_cpu_apicid[];
90544 +
90545 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
90546 +{
90547 +       return cpus_addr(cpumask)[0];
90548 +}
90549 +
90550 +static inline int cpu_present_to_apicid(int mps_cpu)
90551 +{
90552 +       if (mps_cpu < NR_CPUS)
90553 +               return (int)bios_cpu_apicid[mps_cpu];
90554 +       else
90555 +               return BAD_APICID;
90556 +}
90557 +
90558 +#ifndef CONFIG_SMP
90559 +#define stack_smp_processor_id() 0
90560 +#define cpu_logical_map(x) (x)
90561 +#else
90562 +#include <asm/thread_info.h>
90563 +#define stack_smp_processor_id() \
90564 +({                                                             \
90565 +       struct thread_info *ti;                                 \
90566 +       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
90567 +       ti->cpu;                                                \
90568 +})
90569 +#endif
90570 +
90571 +static __inline int logical_smp_processor_id(void)
90572 +{
90573 +       /* we don't want to mark this access volatile - bad code generation */
90574 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
90575 +}
90576 +
90577 +#ifdef CONFIG_SMP
90578 +#define cpu_physical_id(cpu)           x86_cpu_to_apicid[cpu]
90579 +#else
90580 +#define cpu_physical_id(cpu)           boot_cpu_id
90581 +static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
90582 +                               void *info, int retry, int wait)
90583 +{
90584 +       /* Disable interrupts here? */
90585 +       func(info);
90586 +       return 0;
90587 +}
90588 +#endif /* !CONFIG_SMP */
90589 +#endif
90590 +
90591 +
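stack_smp_processor_id() above recovers the current CPU number without going through the PDA: it rounds the stack pointer down to the base of the THREAD_SIZE-aligned kernel stack, where struct thread_info lives, and reads its cpu field. A rough C rendering of what that asm computes (a sketch; it assumes CURRENT_MASK is the usual ~(THREAD_SIZE - 1) stack mask):

static inline int stack_cpu_number(void)
{
        unsigned long sp;
        struct thread_info *ti;

        asm("movq %%rsp, %0" : "=r" (sp));
        ti = (struct thread_info *)(sp & CURRENT_MASK);  /* stack base */
        return ti->cpu;
}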
90592 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/synch_bitops.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/synch_bitops.h
90593 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
90594 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/synch_bitops.h       2007-02-02 19:10:58.000000000 +0000
90595 @@ -0,0 +1,2 @@
90596 +
90597 +#include <asm-i386/mach-xen/asm/synch_bitops.h>
90598 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/system.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/system.h
90599 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/system.h       1970-01-01 00:00:00.000000000 +0000
90600 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/system.h     2007-02-02 19:10:58.000000000 +0000
90601 @@ -0,0 +1,266 @@
90602 +#ifndef __ASM_SYSTEM_H
90603 +#define __ASM_SYSTEM_H
90604 +
90605 +#include <linux/kernel.h>
90606 +#include <asm/segment.h>
90607 +#include <asm/alternative.h>
90608 +#include <asm/synch_bitops.h>
90609 +#include <asm/hypervisor.h>
90610 +#include <xen/interface/arch-x86_64.h>
90611 +
90612 +#ifdef __KERNEL__
90613 +
90614 +#ifdef CONFIG_SMP
90615 +#define __vcpu_id smp_processor_id()
90616 +#else
90617 +#define __vcpu_id 0
90618 +#endif
90619 +
90620 +#define __STR(x) #x
90621 +#define STR(x) __STR(x)
90622 +
90623 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
90624 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
90625 +
90626 +/* frame pointer must be last for get_wchan */
90627 +#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
90628 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
90629 +
90630 +#define __EXTRA_CLOBBER  \
90631 +       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
90632 +
90633 +/* Save restore flags to clear handle leaking NT */
90634 +#define switch_to(prev,next,last) \
90635 +       asm volatile(SAVE_CONTEXT                                                   \
90636 +                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
90637 +                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
90638 +                    "call __switch_to\n\t"                                       \
90639 +                    ".globl thread_return\n"                                   \
90640 +                    "thread_return:\n\t"                                           \
90641 +                    "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
90642 +                    "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
90643 +                    LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
90644 +                    "movq %%rax,%%rdi\n\t"                                       \
90645 +                    "jc   ret_from_fork\n\t"                                     \
90646 +                    RESTORE_CONTEXT                                                \
90647 +                    : "=a" (last)                                                \
90648 +                    : [next] "S" (next), [prev] "D" (prev),                      \
90649 +                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
90650 +                      [ti_flags] "i" (offsetof(struct thread_info, flags)),\
90651 +                      [tif_fork] "i" (TIF_FORK),                         \
90652 +                      [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
90653 +                      [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
90654 +                    : "memory", "cc" __EXTRA_CLOBBER)
90655 +    
90656 +extern void load_gs_index(unsigned); 
90657 +
90658 +/*
90659 + * Load a segment. Fall back on loading the zero
90660 + * segment if something goes wrong..
90661 + */
90662 +#define loadsegment(seg,value) \
90663 +       asm volatile("\n"                       \
90664 +               "1:\t"                          \
90665 +               "movl %k0,%%" #seg "\n"         \
90666 +               "2:\n"                          \
90667 +               ".section .fixup,\"ax\"\n"      \
90668 +               "3:\t"                          \
90669 +               "movl %1,%%" #seg "\n\t"        \
90670 +               "jmp 2b\n"                      \
90671 +               ".previous\n"                   \
90672 +               ".section __ex_table,\"a\"\n\t" \
90673 +               ".align 8\n\t"                  \
90674 +               ".quad 1b,3b\n"                 \
90675 +               ".previous"                     \
90676 +               : :"r" (value), "r" (0))
90677 +
90678 +/*
90679 + * Clear and set 'TS' bit respectively
90680 + */
90681 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
90682 +
90683 +static inline unsigned long read_cr0(void)
90684 +{ 
90685 +       unsigned long cr0;
90686 +       asm volatile("movq %%cr0,%0" : "=r" (cr0));
90687 +       return cr0;
90688 +} 
90689 +
90690 +static inline void write_cr0(unsigned long val) 
90691 +{ 
90692 +       asm volatile("movq %0,%%cr0" :: "r" (val));
90693 +} 
90694 +
90695 +static inline unsigned long read_cr3(void)
90696 +{ 
90697 +       unsigned long cr3;
90698 +       asm("movq %%cr3,%0" : "=r" (cr3));
90699 +       return machine_to_phys(cr3);
90700 +} 
90701 +
90702 +static inline unsigned long read_cr4(void)
90703 +{ 
90704 +       unsigned long cr4;
90705 +       asm("movq %%cr4,%0" : "=r" (cr4));
90706 +       return cr4;
90707 +} 
90708 +
90709 +static inline void write_cr4(unsigned long val)
90710 +{ 
90711 +       asm volatile("movq %0,%%cr4" :: "r" (val));
90712 +} 
90713 +
90714 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
90715 +
90716 +#define wbinvd() \
90717 +       __asm__ __volatile__ ("wbinvd": : :"memory");
90718 +
90719 +/*
90720 + * On SMP systems, when the scheduler does migration-cost autodetection,
90721 + * it needs a way to flush as much of the CPU's caches as possible.
90722 + */
90723 +static inline void sched_cacheflush(void)
90724 +{
90725 +       wbinvd();
90726 +}
90727 +
90728 +#endif /* __KERNEL__ */
90729 +
90730 +#define nop() __asm__ __volatile__ ("nop")
90731 +
90732 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
90733 +
90734 +#define tas(ptr) (xchg((ptr),1))
90735 +
90736 +#define __xg(x) ((volatile long *)(x))
90737 +
90738 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
90739 +{
90740 +       *ptr = val;
90741 +}
90742 +
90743 +#define _set_64bit set_64bit
90744 +
90745 +/*
90746 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
90747 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
90748 + *       but generally the primitive is invalid, *ptr is output argument. --ANK
90749 + */
90750 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
90751 +{
90752 +       switch (size) {
90753 +               case 1:
90754 +                       __asm__ __volatile__("xchgb %b0,%1"
90755 +                               :"=q" (x)
90756 +                               :"m" (*__xg(ptr)), "0" (x)
90757 +                               :"memory");
90758 +                       break;
90759 +               case 2:
90760 +                       __asm__ __volatile__("xchgw %w0,%1"
90761 +                               :"=r" (x)
90762 +                               :"m" (*__xg(ptr)), "0" (x)
90763 +                               :"memory");
90764 +                       break;
90765 +               case 4:
90766 +                       __asm__ __volatile__("xchgl %k0,%1"
90767 +                               :"=r" (x)
90768 +                               :"m" (*__xg(ptr)), "0" (x)
90769 +                               :"memory");
90770 +                       break;
90771 +               case 8:
90772 +                       __asm__ __volatile__("xchgq %0,%1"
90773 +                               :"=r" (x)
90774 +                               :"m" (*__xg(ptr)), "0" (x)
90775 +                               :"memory");
90776 +                       break;
90777 +       }
90778 +       return x;
90779 +}
90780 +
90781 +/*
90782 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
90783 + * store NEW in MEM.  Return the initial value in MEM.  Success is
90784 + * indicated by comparing RETURN with OLD.
90785 + */
90786 +
90787 +#define __HAVE_ARCH_CMPXCHG 1
90788 +
90789 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
90790 +                                     unsigned long new, int size)
90791 +{
90792 +       unsigned long prev;
90793 +       switch (size) {
90794 +       case 1:
90795 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
90796 +                                    : "=a"(prev)
90797 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
90798 +                                    : "memory");
90799 +               return prev;
90800 +       case 2:
90801 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
90802 +                                    : "=a"(prev)
90803 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
90804 +                                    : "memory");
90805 +               return prev;
90806 +       case 4:
90807 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
90808 +                                    : "=a"(prev)
90809 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
90810 +                                    : "memory");
90811 +               return prev;
90812 +       case 8:
90813 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
90814 +                                    : "=a"(prev)
90815 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
90816 +                                    : "memory");
90817 +               return prev;
90818 +       }
90819 +       return old;
90820 +}
90821 +
90822 +#define cmpxchg(ptr,o,n)\
90823 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
90824 +                                       (unsigned long)(n),sizeof(*(ptr))))
90825 +
90826 +#ifdef CONFIG_SMP
90827 +#define smp_mb()       mb()
90828 +#define smp_rmb()      rmb()
90829 +#define smp_wmb()      wmb()
90830 +#define smp_read_barrier_depends()     do {} while(0)
90831 +#else
90832 +#define smp_mb()       barrier()
90833 +#define smp_rmb()      barrier()
90834 +#define smp_wmb()      barrier()
90835 +#define smp_read_barrier_depends()     do {} while(0)
90836 +#endif
90837 +
90838 +    
90839 +/*
90840 + * Force strict CPU ordering.
90841 + * And yes, this is required on UP too when we're talking
90842 + * to devices.
90843 + */
90844 +#define mb()   asm volatile("mfence":::"memory")
90845 +#define rmb()  asm volatile("lfence":::"memory")
90846 +
90847 +#ifdef CONFIG_UNORDERED_IO
90848 +#define wmb()  asm volatile("sfence" ::: "memory")
90849 +#else
90850 +#define wmb()  asm volatile("" ::: "memory")
90851 +#endif
90852 +#define read_barrier_depends() do {} while(0)
90853 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
90854 +
90855 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
90856 +
90857 +void safe_halt(void);
90858 +void halt(void);
90859 +
90860 +#include <linux/irqflags.h>
90861 +
90862 +void cpu_idle_wait(void);
90863 +
90864 +extern unsigned long arch_align_stack(unsigned long sp);
90865 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
90866 +
90867 +#endif
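As the comment block above says, cmpxchg() returns the value that was in memory before the operation, so callers detect success by comparing that return value against the expected old value. A small sketch of the conventional retry loop built on it (the counter is hypothetical, for illustration only):

static unsigned long counter;

static void counter_inc(void)
{
        unsigned long old, seen;

        do {
                old  = counter;
                seen = cmpxchg(&counter, old, old + 1);
        } while (seen != old);  /* another CPU updated it first; retry */
}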
90868 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/timer.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/timer.h
90869 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/timer.h        1970-01-01 00:00:00.000000000 +0000
90870 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/timer.h      2007-02-02 19:10:58.000000000 +0000
90871 @@ -0,0 +1,67 @@
90872 +#ifndef _ASMi386_TIMER_H
90873 +#define _ASMi386_TIMER_H
90874 +#include <linux/init.h>
90875 +
90876 +/**
90877 + * struct timer_opts - used to define a timer source
90878 + *
90879 + * @name: name of the timer.
90880 + * @init: Probes and initializes the timer. Takes clock= override 
90881 + *        string as an argument. Returns 0 on success, anything else
90882 + *        on failure.
90883 + * @mark_offset: called by the timer interrupt.
90884 + * @get_offset:  called by gettimeofday(). Returns the number of microseconds
90885 + *               since the last timer interrupt.
90886 + * @monotonic_clock: returns the number of nanoseconds since the init of the
90887 + *                   timer.
90888 + * @delay: delays this many clock cycles.
90889 + */
90890 +struct timer_opts {
90891 +       char* name;
90892 +       void (*mark_offset)(void);
90893 +       unsigned long (*get_offset)(void);
90894 +       unsigned long long (*monotonic_clock)(void);
90895 +       void (*delay)(unsigned long);
90896 +       unsigned long (*read_timer)(void);
90897 +       int (*suspend)(pm_message_t state);
90898 +       int (*resume)(void);
90899 +};
90900 +
90901 +struct init_timer_opts {
90902 +       int (*init)(char *override);
90903 +       struct timer_opts *opts;
90904 +};
90905 +
90906 +#define TICK_SIZE (tick_nsec / 1000)
90907 +
90908 +extern struct timer_opts* __init select_timer(void);
90909 +extern void clock_fallback(void);
90910 +void setup_pit_timer(void);
90911 +
90912 +/* Modifiers for buggy PIT handling */
90913 +
90914 +extern int pit_latch_buggy;
90915 +
90916 +extern struct timer_opts *cur_timer;
90917 +extern int timer_ack;
90918 +
90919 +/* list of externed timers */
90920 +extern struct timer_opts timer_none;
90921 +extern struct timer_opts timer_pit;
90922 +extern struct init_timer_opts timer_pit_init;
90923 +extern struct init_timer_opts timer_tsc_init;
90924 +#ifdef CONFIG_X86_CYCLONE_TIMER
90925 +extern struct init_timer_opts timer_cyclone_init;
90926 +#endif
90927 +
90928 +extern unsigned long calibrate_tsc(void);
90929 +extern void init_cpu_khz(void);
90930 +#ifdef CONFIG_HPET_TIMER
90931 +extern struct init_timer_opts timer_hpet_init;
90932 +extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
90933 +#endif
90934 +
90935 +#ifdef CONFIG_X86_PM_TIMER
90936 +extern struct init_timer_opts timer_pmtmr_init;
90937 +#endif
90938 +#endif
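struct timer_opts and struct init_timer_opts above are how a timer source plugs into this pre-clocksource timekeeping code; the timer_none/timer_pit/timer_hpet declarations further down are instances of the pattern. A skeletal example of such a definition (the 'dummy' names are purely illustrative, not from the kernel):

static void dummy_mark_offset(void) { }
static unsigned long dummy_get_offset(void) { return 0; }
static unsigned long long dummy_monotonic_clock(void) { return 0; }
static void dummy_delay(unsigned long loops) { while (loops--) cpu_relax(); }

static struct timer_opts timer_dummy = {
        .name            = "dummy",
        .mark_offset     = dummy_mark_offset,
        .get_offset      = dummy_get_offset,
        .monotonic_clock = dummy_monotonic_clock,
        .delay           = dummy_delay,
};

static int __init dummy_timer_init(char *override)
{
        return 0;       /* 0 means the probe succeeded */
}

struct init_timer_opts __initdata timer_dummy_init = {
        .init = dummy_timer_init,
        .opts = &timer_dummy,
};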
90939 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/tlbflush.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/tlbflush.h
90940 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/tlbflush.h     1970-01-01 00:00:00.000000000 +0000
90941 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/tlbflush.h   2007-02-02 19:10:58.000000000 +0000
90942 @@ -0,0 +1,120 @@
90943 +#ifndef _X8664_TLBFLUSH_H
90944 +#define _X8664_TLBFLUSH_H
90945 +
90946 +#include <linux/mm.h>
90947 +#include <asm/processor.h>
90948 +
90949 +static inline unsigned long get_cr3(void)
90950 +{
90951 +       unsigned long cr3;
90952 +       asm volatile("mov %%cr3,%0" : "=r" (cr3));
90953 +       return machine_to_phys(cr3);
90954 +}
90955 +
90956 +static inline void set_cr3(unsigned long cr3)
90957 +{
90958 +       BUG();
90959 +       /* What the hell is this supposed to do: JQ */
90960 +       asm volatile("mov %0,%%cr3" :: "r" (cr3) : "memory");
90961 +}
90962 +
90963 +#define __flush_tlb()  xen_tlb_flush()
90964 +
90965 +static inline unsigned long get_cr4(void)
90966 +{
90967 +       unsigned long cr4;
90968 +       asm volatile("mov %%cr4,%0" : "=r" (cr4));
90969 +       return cr4;
90970 +}
90971 +
90972 +static inline void set_cr4(unsigned long cr4)
90973 +{
90974 +       asm volatile("mov %0,%%cr4" :: "r" (cr4) : "memory");
90975 +}
90976 +
90977 +#define __flush_tlb_all() xen_tlb_flush()
90978 +
90979 +#define __flush_tlb_one(addr)  xen_invlpg((unsigned long)addr)
90980 +
90981 +
90982 +/*
90983 + * TLB flushing:
90984 + *
90985 + *  - flush_tlb() flushes the current mm struct TLBs
90986 + *  - flush_tlb_all() flushes all processes TLBs
90987 + *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
90988 + *  - flush_tlb_page(vma, vmaddr) flushes one page
90989 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
90990 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
90991 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
90992 + *
90993 + * x86-64 can only flush individual pages or full VMs. For a range flush
90994 + * we always do the full VM. Might be worth trying if for a small
90995 + * range a few INVLPGs in a row are a win.
90996 + */
90997 +
90998 +#ifndef CONFIG_SMP
90999 +
91000 +#define flush_tlb() __flush_tlb()
91001 +#define flush_tlb_all() __flush_tlb_all()
91002 +#define local_flush_tlb() __flush_tlb()
91003 +
91004 +static inline void flush_tlb_mm(struct mm_struct *mm)
91005 +{
91006 +       if (mm == current->active_mm)
91007 +               __flush_tlb();
91008 +}
91009 +
91010 +static inline void flush_tlb_page(struct vm_area_struct *vma,
91011 +       unsigned long addr)
91012 +{
91013 +       if (vma->vm_mm == current->active_mm)
91014 +               __flush_tlb_one(addr);
91015 +}
91016 +
91017 +static inline void flush_tlb_range(struct vm_area_struct *vma,
91018 +       unsigned long start, unsigned long end)
91019 +{
91020 +       if (vma->vm_mm == current->active_mm)
91021 +               __flush_tlb();
91022 +}
91023 +
91024 +#else
91025 +
91026 +#include <asm/smp.h>
91027 +
91028 +#define local_flush_tlb() \
91029 +       __flush_tlb()
91030 +
91031 +extern void flush_tlb_all(void);
91032 +extern void flush_tlb_current_task(void);
91033 +extern void flush_tlb_mm(struct mm_struct *);
91034 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
91035 +
91036 +#define flush_tlb()    flush_tlb_current_task()
91037 +
91038 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
91039 +{
91040 +       flush_tlb_mm(vma->vm_mm);
91041 +}
91042 +
91043 +#define TLBSTATE_OK    1
91044 +#define TLBSTATE_LAZY  2
91045 +
91046 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
91047 +   ranges. Cost is about 42k of memory for each CPU. */
91048 +#define ARCH_FREE_PTE_NR 5350  
91049 +
91050 +#endif
91051 +
91052 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
91053 +
91054 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
91055 +                                     unsigned long start, unsigned long end)
91056 +{
91057 +       /* x86_64 does not keep any page table caches in a software TLB.
91058 +          The CPUs do in their hardware TLBs, but they are handled
91059 +          by the normal TLB flushing algorithms. */
91060 +}
91061 +
91062 +#endif /* _X8664_TLBFLUSH_H */
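The comment block above enumerates the flush entry points; the per-page variant is what a caller reaches for after editing a single user PTE, with flush_tlb_mm()/flush_tlb() covering anything larger. A sketch of the typical call site (the helper is illustrative, not part of this patch):

/* Illustrative only: shoot down one stale translation after a PTE update. */
static void set_pte_and_flush(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep, pte_t pteval)
{
        set_pte(ptep, pteval);          /* install the new entry         */
        flush_tlb_page(vma, addr);      /* invalidate the cached mapping */
}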
91063 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/vga.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/vga.h
91064 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/vga.h  1970-01-01 00:00:00.000000000 +0000
91065 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/vga.h        2007-02-02 19:10:58.000000000 +0000
91066 @@ -0,0 +1,20 @@
91067 +/*
91068 + *     Access to VGA videoram
91069 + *
91070 + *     (c) 1998 Martin Mares <mj@ucw.cz>
91071 + */
91072 +
91073 +#ifndef _LINUX_ASM_VGA_H_
91074 +#define _LINUX_ASM_VGA_H_
91075 +
91076 +/*
91077 + *     On the PC, we can just recalculate addresses and then
91078 + *     access the videoram directly without any black magic.
91079 + */
91080 +
91081 +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
91082 +
91083 +#define vga_readb(x) (*(x))
91084 +#define vga_writeb(x,y) (*(y) = (x))
91085 +
91086 +#endif
91087 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/xenoprof.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/xenoprof.h
91088 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/xenoprof.h     1970-01-01 00:00:00.000000000 +0000
91089 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/xenoprof.h   2007-02-02 19:10:58.000000000 +0000
91090 @@ -0,0 +1 @@
91091 +#include <asm-i386/mach-xen/asm/xenoprof.h>
91092 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/asm/xor.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/xor.h
91093 --- linux-2.6.19/include/asm-x86_64/mach-xen/asm/xor.h  1970-01-01 00:00:00.000000000 +0000
91094 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/asm/xor.h        2007-02-02 19:10:58.000000000 +0000
91095 @@ -0,0 +1,328 @@
91096 +/*
91097 + * x86-64 changes / gcc fixes from Andi Kleen. 
91098 + * Copyright 2002 Andi Kleen, SuSE Labs.
91099 + *
91100 + * This hasn't been optimized for the hammer yet, but there are likely
91101 + * no advantages to be gotten from x86-64 here anyway.
91102 + */
91103 +
91104 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
91105 +
91106 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to 
91107 +   tell it to do a clts before the register saving. */
91108 +#define XMMS_SAVE do {                         \
91109 +       preempt_disable();                      \
91110 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
91111 +               clts();                         \
91112 +       asm volatile (                          \
91113 +               "movups %%xmm0,(%1)     ;\n\t"  \
91114 +               "movups %%xmm1,0x10(%1) ;\n\t"  \
91115 +               "movups %%xmm2,0x20(%1) ;\n\t"  \
91116 +               "movups %%xmm3,0x30(%1) ;\n\t"  \
91117 +               : "=&r" (cr0)                   \
91118 +               : "r" (xmm_save)                \
91119 +               : "memory");                    \
91120 +} while(0)
91121 +
91122 +#define XMMS_RESTORE do {                      \
91123 +       asm volatile (                          \
91124 +               "sfence                 ;\n\t"  \
91125 +               "movups (%1),%%xmm0     ;\n\t"  \
91126 +               "movups 0x10(%1),%%xmm1 ;\n\t"  \
91127 +               "movups 0x20(%1),%%xmm2 ;\n\t"  \
91128 +               "movups 0x30(%1),%%xmm3 ;\n\t"  \
91129 +               :                               \
91130 +               : "r" (cr0), "r" (xmm_save)     \
91131 +               : "memory");                    \
91132 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
91133 +               stts();                         \
91134 +       preempt_enable();                       \
91135 +} while(0)
91136 +
91137 +#define OFFS(x)                "16*("#x")"
91138 +#define PF_OFFS(x)     "256+16*("#x")"
91139 +#define        PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
91140 +#define LD(x,y)                "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
91141 +#define ST(x,y)                "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
91142 +#define PF1(x)         "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
91143 +#define PF2(x)         "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
91144 +#define PF3(x)         "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
91145 +#define PF4(x)         "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
91146 +#define PF5(x)         "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
91147 +#define XO1(x,y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
91148 +#define XO2(x,y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
91149 +#define XO3(x,y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
91150 +#define XO4(x,y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
91151 +#define XO5(x,y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
91152 +
91153 +
91154 +static void
91155 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
91156 +{
91157 +        unsigned int lines = bytes >> 8;
91158 +       unsigned long cr0;
91159 +       xmm_store_t xmm_save[4];
91160 +
91161 +       XMMS_SAVE;
91162 +
91163 +        asm volatile (
91164 +#undef BLOCK
91165 +#define BLOCK(i) \
91166 +               LD(i,0)                                 \
91167 +                       LD(i+1,1)                       \
91168 +               PF1(i)                                  \
91169 +                               PF1(i+2)                \
91170 +                               LD(i+2,2)               \
91171 +                                       LD(i+3,3)       \
91172 +               PF0(i+4)                                \
91173 +                               PF0(i+6)                \
91174 +               XO1(i,0)                                \
91175 +                       XO1(i+1,1)                      \
91176 +                               XO1(i+2,2)              \
91177 +                                       XO1(i+3,3)      \
91178 +               ST(i,0)                                 \
91179 +                       ST(i+1,1)                       \
91180 +                               ST(i+2,2)               \
91181 +                                       ST(i+3,3)       \
91182 +
91183 +
91184 +               PF0(0)
91185 +                               PF0(2)
91186 +
91187 +       " .align 32                     ;\n"
91188 +        " 1:                            ;\n"
91189 +
91190 +               BLOCK(0)
91191 +               BLOCK(4)
91192 +               BLOCK(8)
91193 +               BLOCK(12)
91194 +
91195 +        "       addq %[inc], %[p1]           ;\n"
91196 +        "       addq %[inc], %[p2]           ;\n"
91197 +               "               decl %[cnt] ; jnz 1b"
91198 +       : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
91199 +       : [inc] "r" (256UL) 
91200 +        : "memory");
91201 +
91202 +       XMMS_RESTORE;
91203 +}
91204 +
91205 +static void
91206 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
91207 +         unsigned long *p3)
91208 +{
91209 +       unsigned int lines = bytes >> 8;
91210 +       xmm_store_t xmm_save[4];
91211 +       unsigned long cr0;
91212 +
91213 +       XMMS_SAVE;
91214 +
91215 +        __asm__ __volatile__ (
91216 +#undef BLOCK
91217 +#define BLOCK(i) \
91218 +               PF1(i)                                  \
91219 +                               PF1(i+2)                \
91220 +               LD(i,0)                                 \
91221 +                       LD(i+1,1)                       \
91222 +                               LD(i+2,2)               \
91223 +                                       LD(i+3,3)       \
91224 +               PF2(i)                                  \
91225 +                               PF2(i+2)                \
91226 +               PF0(i+4)                                \
91227 +                               PF0(i+6)                \
91228 +               XO1(i,0)                                \
91229 +                       XO1(i+1,1)                      \
91230 +                               XO1(i+2,2)              \
91231 +                                       XO1(i+3,3)      \
91232 +               XO2(i,0)                                \
91233 +                       XO2(i+1,1)                      \
91234 +                               XO2(i+2,2)              \
91235 +                                       XO2(i+3,3)      \
91236 +               ST(i,0)                                 \
91237 +                       ST(i+1,1)                       \
91238 +                               ST(i+2,2)               \
91239 +                                       ST(i+3,3)       \
91240 +
91241 +
91242 +               PF0(0)
91243 +                               PF0(2)
91244 +
91245 +       " .align 32                     ;\n"
91246 +        " 1:                            ;\n"
91247 +
91248 +               BLOCK(0)
91249 +               BLOCK(4)
91250 +               BLOCK(8)
91251 +               BLOCK(12)
91252 +
91253 +        "       addq %[inc], %[p1]           ;\n"
91254 +        "       addq %[inc], %[p2]          ;\n"
91255 +        "       addq %[inc], %[p3]           ;\n"
91256 +               "               decl %[cnt] ; jnz 1b"
91257 +       : [cnt] "+r" (lines),
91258 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
91259 +       : [inc] "r" (256UL)
91260 +       : "memory"); 
91261 +       XMMS_RESTORE;
91262 +}
91263 +
91264 +static void
91265 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
91266 +         unsigned long *p3, unsigned long *p4)
91267 +{
91268 +       unsigned int lines = bytes >> 8;
91269 +       xmm_store_t xmm_save[4]; 
91270 +       unsigned long cr0;
91271 +
91272 +       XMMS_SAVE;
91273 +
91274 +        __asm__ __volatile__ (
91275 +#undef BLOCK
91276 +#define BLOCK(i) \
91277 +               PF1(i)                                  \
91278 +                               PF1(i+2)                \
91279 +               LD(i,0)                                 \
91280 +                       LD(i+1,1)                       \
91281 +                               LD(i+2,2)               \
91282 +                                       LD(i+3,3)       \
91283 +               PF2(i)                                  \
91284 +                               PF2(i+2)                \
91285 +               XO1(i,0)                                \
91286 +                       XO1(i+1,1)                      \
91287 +                               XO1(i+2,2)              \
91288 +                                       XO1(i+3,3)      \
91289 +               PF3(i)                                  \
91290 +                               PF3(i+2)                \
91291 +               PF0(i+4)                                \
91292 +                               PF0(i+6)                \
91293 +               XO2(i,0)                                \
91294 +                       XO2(i+1,1)                      \
91295 +                               XO2(i+2,2)              \
91296 +                                       XO2(i+3,3)      \
91297 +               XO3(i,0)                                \
91298 +                       XO3(i+1,1)                      \
91299 +                               XO3(i+2,2)              \
91300 +                                       XO3(i+3,3)      \
91301 +               ST(i,0)                                 \
91302 +                       ST(i+1,1)                       \
91303 +                               ST(i+2,2)               \
91304 +                                       ST(i+3,3)       \
91305 +
91306 +
91307 +               PF0(0)
91308 +                               PF0(2)
91309 +
91310 +       " .align 32                     ;\n"
91311 +        " 1:                            ;\n"
91312 +
91313 +               BLOCK(0)
91314 +               BLOCK(4)
91315 +               BLOCK(8)
91316 +               BLOCK(12)
91317 +
91318 +        "       addq %[inc], %[p1]           ;\n"
91319 +        "       addq %[inc], %[p2]           ;\n"
91320 +        "       addq %[inc], %[p3]           ;\n"
91321 +        "       addq %[inc], %[p4]           ;\n"
91322 +       "       decl %[cnt] ; jnz 1b"
91323 +       : [cnt] "+c" (lines),
91324 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
91325 +       : [inc] "r" (256UL)
91326 +        : "memory" );
91327 +
91328 +       XMMS_RESTORE;
91329 +}
91330 +
91331 +static void
91332 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
91333 +         unsigned long *p3, unsigned long *p4, unsigned long *p5)
91334 +{
91335 +        unsigned int lines = bytes >> 8;
91336 +       xmm_store_t xmm_save[4];
91337 +       unsigned long cr0;
91338 +
91339 +       XMMS_SAVE;
91340 +
91341 +        __asm__ __volatile__ (
91342 +#undef BLOCK
91343 +#define BLOCK(i) \
91344 +               PF1(i)                                  \
91345 +                               PF1(i+2)                \
91346 +               LD(i,0)                                 \
91347 +                       LD(i+1,1)                       \
91348 +                               LD(i+2,2)               \
91349 +                                       LD(i+3,3)       \
91350 +               PF2(i)                                  \
91351 +                               PF2(i+2)                \
91352 +               XO1(i,0)                                \
91353 +                       XO1(i+1,1)                      \
91354 +                               XO1(i+2,2)              \
91355 +                                       XO1(i+3,3)      \
91356 +               PF3(i)                                  \
91357 +                               PF3(i+2)                \
91358 +               XO2(i,0)                                \
91359 +                       XO2(i+1,1)                      \
91360 +                               XO2(i+2,2)              \
91361 +                                       XO2(i+3,3)      \
91362 +               PF4(i)                                  \
91363 +                               PF4(i+2)                \
91364 +               PF0(i+4)                                \
91365 +                               PF0(i+6)                \
91366 +               XO3(i,0)                                \
91367 +                       XO3(i+1,1)                      \
91368 +                               XO3(i+2,2)              \
91369 +                                       XO3(i+3,3)      \
91370 +               XO4(i,0)                                \
91371 +                       XO4(i+1,1)                      \
91372 +                               XO4(i+2,2)              \
91373 +                                       XO4(i+3,3)      \
91374 +               ST(i,0)                                 \
91375 +                       ST(i+1,1)                       \
91376 +                               ST(i+2,2)               \
91377 +                                       ST(i+3,3)       \
91378 +
91379 +
91380 +               PF0(0)
91381 +                               PF0(2)
91382 +
91383 +       " .align 32                     ;\n"
91384 +        " 1:                            ;\n"
91385 +
91386 +               BLOCK(0)
91387 +               BLOCK(4)
91388 +               BLOCK(8)
91389 +               BLOCK(12)
91390 +
91391 +        "       addq %[inc], %[p1]           ;\n"
91392 +        "       addq %[inc], %[p2]           ;\n"
91393 +        "       addq %[inc], %[p3]           ;\n"
91394 +        "       addq %[inc], %[p4]           ;\n"
91395 +        "       addq %[inc], %[p5]           ;\n"
91396 +       "       decl %[cnt] ; jnz 1b"
91397 +       : [cnt] "+c" (lines),
91398 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 
91399 +         [p5] "+r" (p5)
91400 +       : [inc] "r" (256UL)
91401 +       : "memory");
91402 +
91403 +       XMMS_RESTORE;
91404 +}
91405 +
91406 +static struct xor_block_template xor_block_sse = {
91407 +        .name = "generic_sse",
91408 +        .do_2 = xor_sse_2,
91409 +        .do_3 = xor_sse_3,
91410 +        .do_4 = xor_sse_4,
91411 +        .do_5 = xor_sse_5,
91412 +};
91413 +
91414 +#undef XOR_TRY_TEMPLATES
91415 +#define XOR_TRY_TEMPLATES                              \
91416 +       do {                                            \
91417 +               xor_speed(&xor_block_sse);      \
91418 +       } while (0)
91419 +
91420 +/* We force the use of the SSE xor block because it can write around L2.
91421 +   We may also be able to load into the L1 only depending on how the cpu
91422 +   deals with a load to a line that is being prefetched.  */
91423 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
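xor_sse_2() through xor_sse_5() stream 256-byte blocks through the XMM registers, but the value they compute is simply the word-wise XOR of the source buffers accumulated into p1. A plain C reference for the two-buffer case, equivalent in effect to the do_2 template above (shown for comparison only; the SSE path is what XOR_SELECT_TEMPLATE forces):

static void xor_ref_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        long words = bytes / sizeof(unsigned long);

        while (words--) {
                *p1 ^= *p2;     /* accumulate parity into p1 */
                p1++;
                p2++;
        }
}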
91424 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/irq_vectors.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/irq_vectors.h
91425 --- linux-2.6.19/include/asm-x86_64/mach-xen/irq_vectors.h      1970-01-01 00:00:00.000000000 +0000
91426 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/irq_vectors.h    2007-02-02 19:10:58.000000000 +0000
91427 @@ -0,0 +1,123 @@
91428 +/*
91429 + * This file should contain #defines for all of the interrupt vector
91430 + * numbers used by this architecture.
91431 + *
91432 + * In addition, there are some standard defines:
91433 + *
91434 + *     FIRST_EXTERNAL_VECTOR:
91435 + *             The first free place for external interrupts
91436 + *
91437 + *     SYSCALL_VECTOR:
91438 + *             The IRQ vector a syscall makes the user to kernel transition
91439 + *             under.
91440 + *
91441 + *     TIMER_IRQ:
91442 + *             The IRQ number the timer interrupt comes in at.
91443 + *
91444 + *     NR_IRQS:
91445 + *             The total number of interrupt vectors (including all the
91446 + *             architecture specific interrupts) needed.
91447 + *
91448 + */                    
91449 +#ifndef _ASM_IRQ_VECTORS_H
91450 +#define _ASM_IRQ_VECTORS_H
91451 +
91452 +/*
91453 + * IDT vectors usable for external interrupt sources start
91454 + * at 0x20:
91455 + */
91456 +#define FIRST_EXTERNAL_VECTOR  0x20
91457 +
91458 +#define SYSCALL_VECTOR         0x80
91459 +
91460 +/*
91461 + * Vectors 0x20-0x2f are used for ISA interrupts.
91462 + */
91463 +
91464 +#if 0
91465 +/*
91466 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
91467 + *
91468 + *  some of the following vectors are 'rare', they are merged
91469 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
91470 + *  TLB, reschedule and local APIC vectors are performance-critical.
91471 + *
91472 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
91473 + */
91474 +#define INVALIDATE_TLB_VECTOR  0xfd
91475 +#define RESCHEDULE_VECTOR      0xfc
91476 +#define CALL_FUNCTION_VECTOR   0xfb
91477 +
91478 +#define THERMAL_APIC_VECTOR    0xf0
91479 +/*
91480 + * Local APIC timer IRQ vector is on a different priority level,
91481 + * to work around the 'lost local interrupt if more than 2 IRQ
91482 + * sources per level' errata.
91483 + */
91484 +#define LOCAL_TIMER_VECTOR     0xef
91485 +#endif
91486 +
91487 +#define SPURIOUS_APIC_VECTOR   0xff
91488 +#define ERROR_APIC_VECTOR      0xfe
91489 +
91490 +/*
91491 + * First APIC vector available to drivers: (vectors 0x30-0xee)
91492 + * we start at 0x31 to spread out vectors evenly between priority
91493 + * levels. (0x80 is the syscall vector)
91494 + */
91495 +#define FIRST_DEVICE_VECTOR    0x31
91496 +#define FIRST_SYSTEM_VECTOR    0xef
91497 +
91498 +/*
91499 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
91500 + * Right now the APIC is mostly only used for SMP.
91501 + * 256 vectors is an architectural limit. (we can have
91502 + * more than 256 devices theoretically, but they will
91503 + * have to use shared interrupts)
91504 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
91505 + * the usable vector space is 0x20-0xff (224 vectors)
91506 + */
91507 +
91508 +#define RESCHEDULE_VECTOR      0
91509 +#define CALL_FUNCTION_VECTOR   1
91510 +#define NR_IPIS                        2
91511 +
91512 +/*
91513 + * The maximum number of vectors supported by i386 processors
91514 + * is limited to 256. For processors other than i386, NR_VECTORS
91515 + * should be changed accordingly.
91516 + */
91517 +#define NR_VECTORS 256
91518 +
91519 +#define FPU_IRQ                        13
91520 +
91521 +#define        FIRST_VM86_IRQ          3
91522 +#define LAST_VM86_IRQ          15
91523 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
91524 +
91525 +/*
91526 + * The flat IRQ space is divided into two regions:
91527 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
91528 + *     if we have physical device-access privilege. This region is at the 
91529 + *     start of the IRQ space so that existing device drivers do not need
91530 + *     to be modified to translate physical IRQ numbers into our IRQ space.
91531 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
91532 + *     are bound using the provided bind/unbind functions.
91533 + */
91534 +
91535 +#define PIRQ_BASE              0
91536 +#define NR_PIRQS               256
91537 +
91538 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
91539 +#define NR_DYNIRQS             256
91540 +
91541 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
91542 +#define NR_IRQ_VECTORS         NR_IRQS
91543 +
91544 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
91545 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
91546 +
91547 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
91548 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
91549 +
91550 +#endif /* _ASM_IRQ_VECTORS_H */
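With PIRQ_BASE 0, NR_PIRQS 256 and DYNIRQ_BASE 256 as defined above, the flat IRQ space keeps physical IRQs at their native numbers (0-255) and places dynamically bound event-channel IRQs at 256-511; the conversion macros are plain offsets. A tiny illustration (the function is hypothetical):

static int bind_dynirq_example(int slot)
{
        int irq = dynirq_to_irq(slot);          /* e.g. slot 5 -> IRQ 261 */

        BUG_ON(irq_to_dynirq(irq) != slot);     /* the mapping is a pure offset */
        return irq;
}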
91551 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/mach_time.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/mach_time.h
91552 --- linux-2.6.19/include/asm-x86_64/mach-xen/mach_time.h        1970-01-01 00:00:00.000000000 +0000
91553 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/mach_time.h      2007-02-02 19:10:58.000000000 +0000
91554 @@ -0,0 +1,111 @@
91555 +/*
91556 + *  include/asm-i386/mach-default/mach_time.h
91557 + *
91558 + *  Machine specific set RTC function for generic.
91559 + *  Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
91560 + */
91561 +#ifndef _MACH_TIME_H
91562 +#define _MACH_TIME_H
91563 +
91564 +#include <asm-i386/mc146818rtc.h>
91565 +
91566 +/* for check timing call set_rtc_mmss() 500ms     */
91567 +/* used in arch/i386/time.c::do_timer_interrupt() */
91568 +#define USEC_AFTER     500000
91569 +#define USEC_BEFORE    500000
91570 +
91571 +/*
91572 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
91573 + * called 500 ms after the second nowtime has started, because when
91574 + * nowtime is written into the registers of the CMOS clock, it will
91575 + * jump to the next second precisely 500 ms later. Check the Motorola
91576 + * MC146818A or Dallas DS12887 data sheet for details.
91577 + *
91578 + * BUG: This routine does not handle hour overflow properly; it just
91579 + *      sets the minutes. Usually you'll only notice that after reboot!
91580 + */
91581 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
91582 +{
91583 +       int retval = 0;
91584 +       int real_seconds, real_minutes, cmos_minutes;
91585 +       unsigned char save_control, save_freq_select;
91586 +
91587 +       save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
91588 +       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
91589 +
91590 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
91591 +       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
91592 +
91593 +       cmos_minutes = CMOS_READ(RTC_MINUTES);
91594 +       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
91595 +               BCD_TO_BIN(cmos_minutes);
91596 +
91597 +       /*
91598 +        * since we're only adjusting minutes and seconds,
91599 +        * don't interfere with hour overflow. This avoids
91600 +        * messing with unknown time zones but requires your
91601 +        * RTC not to be off by more than 15 minutes
91602 +        */
91603 +       real_seconds = nowtime % 60;
91604 +       real_minutes = nowtime / 60;
91605 +       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
91606 +               real_minutes += 30;             /* correct for half hour time zone */
91607 +       real_minutes %= 60;
91608 +
91609 +       if (abs(real_minutes - cmos_minutes) < 30) {
91610 +               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
91611 +                       BIN_TO_BCD(real_seconds);
91612 +                       BIN_TO_BCD(real_minutes);
91613 +               }
91614 +               CMOS_WRITE(real_seconds,RTC_SECONDS);
91615 +               CMOS_WRITE(real_minutes,RTC_MINUTES);
91616 +       } else {
91617 +               printk(KERN_WARNING
91618 +                      "set_rtc_mmss: can't update from %d to %d\n",
91619 +                      cmos_minutes, real_minutes);
91620 +               retval = -1;
91621 +       }
91622 +
91623 +       /* The following flags have to be released exactly in this order,
91624 +        * otherwise the DS12887 (popular MC146818A clone with integrated
91625 +        * battery and quartz) will not reset the oscillator and will not
91626 +        * update precisely 500 ms later. You won't find this mentioned in
91627 +        * the Dallas Semiconductor data sheets, but who believes data
91628 +        * sheets anyway ...                           -- Markus Kuhn
91629 +        */
91630 +       CMOS_WRITE(save_control, RTC_CONTROL);
91631 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
91632 +
91633 +       return retval;
91634 +}
91635 +
91636 +static inline unsigned long mach_get_cmos_time(void)
91637 +{
91638 +       unsigned int year, mon, day, hour, min, sec;
91639 +
91640 +       do {
91641 +               sec = CMOS_READ(RTC_SECONDS);
91642 +               min = CMOS_READ(RTC_MINUTES);
91643 +               hour = CMOS_READ(RTC_HOURS);
91644 +               day = CMOS_READ(RTC_DAY_OF_MONTH);
91645 +               mon = CMOS_READ(RTC_MONTH);
91646 +               year = CMOS_READ(RTC_YEAR);
91647 +       } while (sec != CMOS_READ(RTC_SECONDS));
91648 +
91649 +       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
91650 +               BCD_TO_BIN(sec);
91651 +               BCD_TO_BIN(min);
91652 +               BCD_TO_BIN(hour);
91653 +               BCD_TO_BIN(day);
91654 +               BCD_TO_BIN(mon);
91655 +               BCD_TO_BIN(year);
91656 +       }
91657 +
91658 +       year += 1900;
91659 +       if (year < 1970)
91660 +               year += 100;
91661 +
91662 +       return mktime(year, mon, day, hour, min, sec);
91663 +}
91664 +
91665 +#endif /* !_MACH_TIME_H */
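Both helpers above read and write the RTC registers in BCD unless the RTC_DM_BINARY control bit is set; BCD_TO_BIN and BIN_TO_BCD come from mc146818rtc.h. Their effect, written out as plain functions for reference (a sketch of the conventional definitions):

/* Conventional BCD packing: 0x59 in BCD <-> 59 in binary. */
static inline unsigned char bcd_to_bin(unsigned char val)
{
        return (val & 0x0f) + (val >> 4) * 10;
}

static inline unsigned char bin_to_bcd(unsigned char val)
{
        return ((val / 10) << 4) | (val % 10);
}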
91666 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/mach_timer.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/mach_timer.h
91667 --- linux-2.6.19/include/asm-x86_64/mach-xen/mach_timer.h       1970-01-01 00:00:00.000000000 +0000
91668 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/mach_timer.h     2007-02-02 19:10:58.000000000 +0000
91669 @@ -0,0 +1,48 @@
91670 +/*
91671 + *  include/asm-i386/mach-default/mach_timer.h
91672 + *
91673 + *  Machine specific calibrate_tsc() for generic.
91674 + *  Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
91675 + */
91676 +/* ------ Calibrate the TSC ------- 
91677 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
91678 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
91679 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
91680 + * output busy loop as low as possible. We avoid reading the CTC registers
91681 + * directly because of the awkward 8-bit access mechanism of the 82C54
91682 + * device.
91683 + */
91684 +#ifndef _MACH_TIMER_H
91685 +#define _MACH_TIMER_H
91686 +
91687 +#define CALIBRATE_LATCH        (5 * LATCH)
91688 +
91689 +static inline void mach_prepare_counter(void)
91690 +{
91691 +       /* Set the Gate high, disable speaker */
91692 +       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
91693 +
91694 +       /*
91695 +        * Now let's take care of CTC channel 2
91696 +        *
91697 +        * Set the Gate high, program CTC channel 2 for mode 0,
91698 +        * (interrupt on terminal count mode), binary count,
91699 +        * load 5 * LATCH count, (LSB and MSB) to begin countdown.
91700 +        *
91701 +        * Some devices need a delay here.
91702 +        */
91703 +       outb(0xb0, 0x43);                       /* binary, mode 0, LSB/MSB, Ch 2 */
91704 +       outb_p(CALIBRATE_LATCH & 0xff, 0x42);   /* LSB of count */
91705 +       outb_p(CALIBRATE_LATCH >> 8, 0x42);       /* MSB of count */
91706 +}
91707 +
91708 +static inline void mach_countup(unsigned long *count_p)
91709 +{
91710 +       unsigned long count = 0;
91711 +       do {
91712 +               count++;
91713 +       } while ((inb_p(0x61) & 0x20) == 0);
91714 +       *count_p = count;
91715 +}
91716 +
91717 +#endif /* !_MACH_TIMER_H */
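mach_prepare_counter() and mach_countup() are the two halves of the classic PIT-channel-2 calibration: arm a known 5*LATCH countdown, then spin until the speaker-output status bit flips at terminal count. A simplified sketch of how a caller pairs them with the TSC (rdtscll as in asm/msr.h; this is not the actual calibrate_tsc(), which also scales the result into a quotient):

static unsigned long long tsc_cycles_per_calibration(void)
{
        unsigned long long start, end;
        unsigned long count;

        mach_prepare_counter();         /* arm PIT channel 2 with 5*LATCH */
        rdtscll(start);
        mach_countup(&count);           /* spin until terminal count      */
        rdtscll(end);

        /* TSC cycles elapsed over five timer ticks' worth of wall time. */
        return end - start;
}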
91718 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/setup_arch_post.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/setup_arch_post.h
91719 --- linux-2.6.19/include/asm-x86_64/mach-xen/setup_arch_post.h  1970-01-01 00:00:00.000000000 +0000
91720 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/setup_arch_post.h        2007-02-02 19:10:58.000000000 +0000
91721 @@ -0,0 +1,63 @@
91722 +/**
91723 + * machine_specific_* - Hooks for machine specific setup.
91724 + *
91725 + * Description:
91726 + *     This is included late in kernel/setup.c so that it can make
91727 + *     use of all of the static functions.
91728 + **/
91729 +
91730 +#include <xen/interface/callback.h>
91731 +
91732 +extern void hypervisor_callback(void);
91733 +extern void failsafe_callback(void);
91734 +extern void nmi(void);
91735 +
91736 +static void __init machine_specific_arch_setup(void)
91737 +{
91738 +       int ret;
91739 +       static struct callback_register __initdata event = {
91740 +               .type = CALLBACKTYPE_event,
91741 +               .address = (unsigned long) hypervisor_callback,
91742 +       };
91743 +       static struct callback_register __initdata failsafe = {
91744 +               .type = CALLBACKTYPE_failsafe,
91745 +               .address = (unsigned long)failsafe_callback,
91746 +       };
91747 +       static struct callback_register __initdata syscall = {
91748 +               .type = CALLBACKTYPE_syscall,
91749 +               .address = (unsigned long)system_call,
91750 +       };
91751 +#ifdef CONFIG_X86_LOCAL_APIC
91752 +       static struct callback_register __initdata nmi_cb = {
91753 +               .type = CALLBACKTYPE_nmi,
91754 +               .address = (unsigned long)nmi,
91755 +       };
91756 +#endif
91757 +
91758 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
91759 +       if (ret == 0)
91760 +               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
91761 +       if (ret == 0)
91762 +               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
91763 +#ifdef CONFIG_XEN_COMPAT_030002
91764 +       if (ret == -ENOSYS)
91765 +               ret = HYPERVISOR_set_callbacks(
91766 +                       event.address,
91767 +                       failsafe.address,
91768 +                       syscall.address);
91769 +#endif
91770 +       BUG_ON(ret);
91771 +
91772 +#ifdef CONFIG_X86_LOCAL_APIC
91773 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
91774 +#ifdef CONFIG_XEN_COMPAT_030002
91775 +       if (ret == -ENOSYS) {
91776 +               static struct xennmi_callback __initdata cb = {
91777 +                       .handler_address = (unsigned long)nmi
91778 +               };
91779 +
91780 +               HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
91781 +       }
91782 +#endif
91783 +#endif
91784 +}
91785 diff -ruNp linux-2.6.19/include/asm-x86_64/mach-xen/setup_arch_pre.h linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/setup_arch_pre.h
91786 --- linux-2.6.19/include/asm-x86_64/mach-xen/setup_arch_pre.h   1970-01-01 00:00:00.000000000 +0000
91787 +++ linux-2.6.19-xen-3.0.4/include/asm-x86_64/mach-xen/setup_arch_pre.h 2007-02-02 19:10:58.000000000 +0000
91788 @@ -0,0 +1,5 @@
91789 +/* Hook to call BIOS initialisation function */
91790 +
91791 +#define ARCH_SETUP machine_specific_arch_setup();
91792 +
91793 +static void __init machine_specific_arch_setup(void);
91794 diff -ruNp linux-2.6.19/include/linux/gfp.h linux-2.6.19-xen-3.0.4/include/linux/gfp.h
91795 --- linux-2.6.19/include/linux/gfp.h    2006-11-29 21:57:37.000000000 +0000
91796 +++ linux-2.6.19-xen-3.0.4/include/linux/gfp.h  2007-02-02 19:10:59.000000000 +0000
91797 @@ -114,7 +114,11 @@ static inline enum zone_type gfp_zone(gf
91798   */
91799  
91800  #ifndef HAVE_ARCH_FREE_PAGE
91801 -static inline void arch_free_page(struct page *page, int order) { }
91802 +/*
91803 + * If arch_free_page returns non-zero then the generic free_page code can
91804 + * immediately bail: the arch-specific function has done all the work.
91805 + */
91806 +static inline int arch_free_page(struct page *page, int order) { return 0; }
91807  #endif
91808  
91809  extern struct page *
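A hedged sketch of the caller pattern the new arch_free_page() return value enables; the function and comments below are illustrative, not lifted from this patch.

/* Sketch only: how a free path can honor the non-zero return. */
static void example_free_page(struct page *page, unsigned int order)
{
        if (arch_free_page(page, order))
                return;         /* arch code (e.g. a Xen balloon hook) already took the page */

        /* ...continue with the generic buddy-allocator free path... */
}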
91810 diff -ruNp linux-2.6.19/include/linux/highmem.h linux-2.6.19-xen-3.0.4/include/linux/highmem.h
91811 --- linux-2.6.19/include/linux/highmem.h        2006-11-29 21:57:37.000000000 +0000
91812 +++ linux-2.6.19-xen-3.0.4/include/linux/highmem.h      2007-02-02 19:10:59.000000000 +0000
91813 @@ -25,10 +25,16 @@ static inline void flush_kernel_dcache_p
91814  /* declarations for linux/mm/highmem.c */
91815  unsigned int nr_free_highpages(void);
91816  extern unsigned long totalhigh_pages;
91817 +#ifdef CONFIG_XEN
91818 +void kmap_flush_unused(void);
91819 +#endif
91820  
91821  #else /* CONFIG_HIGHMEM */
91822  
91823  static inline unsigned int nr_free_highpages(void) { return 0; }
91824 +#ifdef CONFIG_XEN
91825 +static inline void kmap_flush_unused(void) { }
91826 +#endif
91827  
91828  #define totalhigh_pages 0
91829  
91830 diff -ruNp linux-2.6.19/include/linux/interrupt.h linux-2.6.19-xen-3.0.4/include/linux/interrupt.h
91831 --- linux-2.6.19/include/linux/interrupt.h      2007-02-02 20:26:43.000000000 +0000
91832 +++ linux-2.6.19-xen-3.0.4/include/linux/interrupt.h    2007-02-02 19:10:59.000000000 +0000
91833 @@ -184,6 +184,12 @@ static inline int disable_irq_wake(unsig
91834  
91835  #endif /* CONFIG_GENERIC_HARDIRQS */
91836  
91837 +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
91838 +int irq_ignore_unhandled(unsigned int irq);
91839 +#else
91840 +#define irq_ignore_unhandled(irq) 0
91841 +#endif
91842 +
91843  #ifndef __ARCH_SET_SOFTIRQ_PENDING
91844  #define set_softirq_pending(x) (local_softirq_pending() = (x))
91845  #define or_softirq_pending(x)  (local_softirq_pending() |= (x))
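A hedged sketch of how a spurious-interrupt path can consult irq_ignore_unhandled(); the surrounding function is illustrative.

/* Sketch: skip unhandled-IRQ accounting when the arch says to ignore it. */
static void example_note_unhandled(unsigned int irq)
{
        if (irq_ignore_unhandled(irq))
                return;         /* e.g. under Xen another domain may own this line */

        /* ... normal unhandled-interrupt bookkeeping ... */
}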
91846 diff -ruNp linux-2.6.19/include/linux/kexec.h linux-2.6.19-xen-3.0.4/include/linux/kexec.h
91847 --- linux-2.6.19/include/linux/kexec.h  2006-11-29 21:57:37.000000000 +0000
91848 +++ linux-2.6.19-xen-3.0.4/include/linux/kexec.h        2007-02-02 19:10:59.000000000 +0000
91849 @@ -31,6 +31,13 @@
91850  #error KEXEC_ARCH not defined
91851  #endif
91852  
91853 +#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
91854 +#define kexec_page_to_pfn(page)  page_to_pfn(page)
91855 +#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
91856 +#define kexec_virt_to_phys(addr) virt_to_phys(addr)
91857 +#define kexec_phys_to_virt(addr) phys_to_virt(addr)
91858 +#endif
91859 +
91860  /*
91861   * This structure is used to hold the arguments that are used when loading
91862   * kernel binaries.
91863 @@ -91,6 +98,12 @@ struct kimage {
91864  extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
91865  extern int machine_kexec_prepare(struct kimage *image);
91866  extern void machine_kexec_cleanup(struct kimage *image);
91867 +#ifdef CONFIG_XEN
91868 +extern int xen_machine_kexec_load(struct kimage *image);
91869 +extern void xen_machine_kexec_unload(struct kimage *image);
91870 +extern void xen_machine_kexec_setup_resources(void);
91871 +extern void xen_machine_kexec_register_resources(struct resource *res);
91872 +#endif
91873  extern asmlinkage long sys_kexec_load(unsigned long entry,
91874                                         unsigned long nr_segments,
91875                                         struct kexec_segment __user *segments,
91876 diff -ruNp linux-2.6.19/include/linux/mm.h linux-2.6.19-xen-3.0.4/include/linux/mm.h
91877 --- linux-2.6.19/include/linux/mm.h     2006-11-29 21:57:37.000000000 +0000
91878 +++ linux-2.6.19-xen-3.0.4/include/linux/mm.h   2007-02-02 19:10:59.000000000 +0000
91879 @@ -166,6 +166,9 @@ extern unsigned int kobjsize(const void 
91880  #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
91881  #define VM_MAPPED_COPY 0x01000000      /* T if mapped copy of data (nommu mmap) */
91882  #define VM_INSERTPAGE  0x02000000      /* The vma has had "vm_insert_page()" done on it */
91883 +#ifdef CONFIG_XEN
91884 +#define VM_FOREIGN     0x04000000      /* Has pages belonging to another VM */
91885 +#endif
91886  
91887  #ifndef VM_STACK_DEFAULT_FLAGS         /* arch can override this */
91888  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
91889 @@ -1092,6 +1095,13 @@ struct page *follow_page(struct vm_area_
91890  #define FOLL_GET       0x04    /* do get_page on page */
91891  #define FOLL_ANON      0x08    /* give ZERO_PAGE if no pgtable */
91892  
91893 +#ifdef CONFIG_XEN
91894 +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
91895 +                       void *data);
91896 +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
91897 +                              unsigned long size, pte_fn_t fn, void *data);
91898 +#endif
91899 +
91900  #ifdef CONFIG_PROC_FS
91901  void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
91902  #else
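A usage sketch for apply_to_page_range() with a trivial pte_fn_t callback; the callback and its behaviour are illustrative assumptions, not code from this patch.

/* Illustrative callback: count present PTEs in a range. */
static int count_present_pte(pte_t *pte, struct page *pmd_page,
                             unsigned long addr, void *data)
{
        unsigned long *nr = data;

        if (pte_present(*pte))
                (*nr)++;
        return 0;               /* a non-zero return aborts the walk */
}

static unsigned long count_present(struct mm_struct *mm,
                                   unsigned long start, unsigned long len)
{
        unsigned long nr = 0;

        apply_to_page_range(mm, start, len, count_present_pte, &nr);
        return nr;
}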
91903 diff -ruNp linux-2.6.19/include/linux/oprofile.h linux-2.6.19-xen-3.0.4/include/linux/oprofile.h
91904 --- linux-2.6.19/include/linux/oprofile.h       2006-11-29 21:57:37.000000000 +0000
91905 +++ linux-2.6.19-xen-3.0.4/include/linux/oprofile.h     2007-02-02 19:10:59.000000000 +0000
91906 @@ -16,6 +16,10 @@
91907  #include <linux/types.h>
91908  #include <linux/spinlock.h>
91909  #include <asm/atomic.h>
91910 +
91911 +#ifdef CONFIG_XEN
91912 +#include <xen/interface/xenoprof.h>
91913 +#endif
91914   
91915  struct super_block;
91916  struct dentry;
91917 @@ -27,6 +31,11 @@ struct oprofile_operations {
91918         /* create any necessary configuration files in the oprofile fs.
91919          * Optional. */
91920         int (*create_files)(struct super_block * sb, struct dentry * root);
91921 +       /* setup active domains with Xen */
91922 +       int (*set_active)(int *active_domains, unsigned int adomains);
91923 +       /* setup passive domains with Xen */
91924 +       int (*set_passive)(int *passive_domains, unsigned int pdomains);
91925 +
91926         /* Do any necessary interrupt setup. Optional. */
91927         int (*setup)(void);
91928         /* Do any necessary interrupt shutdown. Optional. */
91929 @@ -78,6 +87,8 @@ void oprofile_add_pc(unsigned long pc, i
91930  /* add a backtrace entry, to be called from the ->backtrace callback */
91931  void oprofile_add_trace(unsigned long eip);
91932  
91933 +/* add a domain switch entry */
91934 +int oprofile_add_domain_switch(int32_t domain_id);
91935  
91936  /**
91937   * Create a file of the given name as a child of the given root, with
91938 diff -ruNp linux-2.6.19/include/linux/pfn.h linux-2.6.19-xen-3.0.4/include/linux/pfn.h
91939 --- linux-2.6.19/include/linux/pfn.h    2006-11-29 21:57:37.000000000 +0000
91940 +++ linux-2.6.19-xen-3.0.4/include/linux/pfn.h  2007-02-02 19:10:59.000000000 +0000
91941 @@ -4,6 +4,10 @@
91942  #define PFN_ALIGN(x)   (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
91943  #define PFN_UP(x)      (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
91944  #define PFN_DOWN(x)    ((x) >> PAGE_SHIFT)
91945 +#if defined(CONFIG_X86_XEN) && defined(CONFIG_X86_PAE)
91946 +#define PFN_PHYS(x)    ((unsigned long long)(x) << PAGE_SHIFT)
91947 +#else
91948  #define PFN_PHYS(x)    ((x) << PAGE_SHIFT)
91949 +#endif
91950  
91951  #endif
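Why the PAE variant matters, as a small worked example: a pfn at or above 0x100000 (4 GiB with 4 KiB pages) shifted left by PAGE_SHIFT no longer fits in a 32-bit unsigned long, so the widened cast preserves the full physical address. Sketch only:

static inline unsigned long long example_pfn_phys(void)
{
        return PFN_PHYS(0x100000UL);    /* 0x100000000 with the long long cast,
                                           0 after truncation to a 32-bit shift */
}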
91952 diff -ruNp linux-2.6.19/include/linux/skbuff.h linux-2.6.19-xen-3.0.4/include/linux/skbuff.h
91953 --- linux-2.6.19/include/linux/skbuff.h 2006-11-29 21:57:37.000000000 +0000
91954 +++ linux-2.6.19-xen-3.0.4/include/linux/skbuff.h       2007-02-02 19:10:59.000000000 +0000
91955 @@ -204,6 +204,8 @@ enum {
91956   *     @local_df: allow local fragmentation
91957   *     @cloned: Head may be cloned (check refcnt to be sure)
91958   *     @nohdr: Payload reference only, must not modify header
91959 + *     @proto_data_valid: Protocol data validated since arriving at localhost
91960 + *     @proto_csum_blank: Protocol csum must be added before leaving localhost
91961   *     @pkt_type: Packet class
91962   *     @fclone: skbuff clone status
91963   *     @ip_summed: Driver fed us an IP checksum
91964 @@ -283,7 +285,13 @@ struct sk_buff {
91965                                 nfctinfo:3;
91966         __u8                    pkt_type:3,
91967                                 fclone:2,
91968 +#ifndef CONFIG_XEN
91969                                 ipvs_property:1;
91970 +#else
91971 +                               ipvs_property:1,
91972 +                               proto_data_valid:1,
91973 +                               proto_csum_blank:1;
91974 +#endif
91975         __be16                  protocol;
91976  
91977         void                    (*destructor)(struct sk_buff *skb);
91978 @@ -346,7 +354,8 @@ static inline struct sk_buff *alloc_skb_
91979  
91980  extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
91981                                             unsigned int size,
91982 -                                           gfp_t priority);
91983 +                                           gfp_t priority,
91984 +                                           int fclone);
91985  extern void           kfree_skbmem(struct sk_buff *skb);
91986  extern struct sk_buff *skb_clone(struct sk_buff *skb,
91987                                  gfp_t priority);
91988 @@ -1087,6 +1096,7 @@ static inline void __skb_queue_purge(str
91989                 kfree_skb(skb);
91990  }
91991  
91992 +#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB
91993  /**
91994   *     __dev_alloc_skb - allocate an skbuff for receiving
91995   *     @length: length to allocate
91996 @@ -1107,6 +1117,9 @@ static inline struct sk_buff *__dev_allo
91997                 skb_reserve(skb, NET_SKB_PAD);
91998         return skb;
91999  }
92000 +#else
92001 +extern struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask);
92002 +#endif
92003  
92004  /**
92005   *     dev_alloc_skb - allocate an skbuff for receiving
92006 diff -ruNp linux-2.6.19/include/xen/balloon.h linux-2.6.19-xen-3.0.4/include/xen/balloon.h
92007 --- linux-2.6.19/include/xen/balloon.h  1970-01-01 00:00:00.000000000 +0000
92008 +++ linux-2.6.19-xen-3.0.4/include/xen/balloon.h        2007-02-02 19:11:00.000000000 +0000
92009 @@ -0,0 +1,57 @@
92010 +/******************************************************************************
92011 + * balloon.h
92012 + *
92013 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
92014 + *
92015 + * Copyright (c) 2003, B Dragovic
92016 + * Copyright (c) 2003-2004, M Williamson, K Fraser
92017 + * 
92018 + * This program is free software; you can redistribute it and/or
92019 + * modify it under the terms of the GNU General Public License version 2
92020 + * as published by the Free Software Foundation; or, when distributed
92021 + * separately from the Linux kernel or incorporated into other
92022 + * software packages, subject to the following license:
92023 + * 
92024 + * Permission is hereby granted, free of charge, to any person obtaining a copy
92025 + * of this source file (the "Software"), to deal in the Software without
92026 + * restriction, including without limitation the rights to use, copy, modify,
92027 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
92028 + * and to permit persons to whom the Software is furnished to do so, subject to
92029 + * the following conditions:
92030 + * 
92031 + * The above copyright notice and this permission notice shall be included in
92032 + * all copies or substantial portions of the Software.
92033 + * 
92034 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92035 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92036 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92037 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
92038 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92039 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
92040 + * IN THE SOFTWARE.
92041 + */
92042 +
92043 +#ifndef __ASM_BALLOON_H__
92044 +#define __ASM_BALLOON_H__
92045 +
92046 +/*
92047 + * Inform the balloon driver that it should allow some slop for device-driver
92048 + * memory activities.
92049 + */
92050 +void balloon_update_driver_allowance(long delta);
92051 +
92052 +/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
92053 +struct page **alloc_empty_pages_and_pagevec(int nr_pages);
92054 +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
92055 +
92056 +void balloon_release_driver_page(struct page *page);
92057 +
92058 +/*
92059 + * Prevent the balloon driver from changing the memory reservation during
92060 + * a driver critical region.
92061 + */
92062 +extern spinlock_t balloon_lock;
92063 +#define balloon_lock(__flags)   spin_lock_irqsave(&balloon_lock, __flags)
92064 +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
92065 +
92066 +#endif /* __ASM_BALLOON_H__ */
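A hedged sketch of the intended balloon_lock()/balloon_unlock() usage around a driver critical region; the body is illustrative.

/* Illustrative critical region: keep the memory reservation stable while
 * a driver manipulates machine-frame mappings. */
static void example_critical_region(void)
{
        unsigned long flags;

        balloon_lock(flags);
        /* ... exchange or remap machine frames here ... */
        balloon_unlock(flags);
}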
92067 diff -ruNp linux-2.6.19/include/xen/cpu_hotplug.h linux-2.6.19-xen-3.0.4/include/xen/cpu_hotplug.h
92068 --- linux-2.6.19/include/xen/cpu_hotplug.h      1970-01-01 00:00:00.000000000 +0000
92069 +++ linux-2.6.19-xen-3.0.4/include/xen/cpu_hotplug.h    2007-02-02 19:11:00.000000000 +0000
92070 @@ -0,0 +1,43 @@
92071 +#ifndef __XEN_CPU_HOTPLUG_H__
92072 +#define __XEN_CPU_HOTPLUG_H__
92073 +
92074 +#include <linux/kernel.h>
92075 +#include <linux/cpumask.h>
92076 +
92077 +#if defined(CONFIG_HOTPLUG_CPU)
92078 +
92079 +#if defined(CONFIG_X86)
92080 +void cpu_initialize_context(unsigned int cpu);
92081 +#else
92082 +#define cpu_initialize_context(cpu)    ((void)0)
92083 +#endif
92084 +
92085 +int cpu_up_check(unsigned int cpu);
92086 +void init_xenbus_allowed_cpumask(void);
92087 +int smp_suspend(void);
92088 +void smp_resume(void);
92089 +
92090 +void cpu_bringup(void);
92091 +
92092 +#else /* !defined(CONFIG_HOTPLUG_CPU) */
92093 +
92094 +#define cpu_up_check(cpu)              (0)
92095 +#define init_xenbus_allowed_cpumask()  ((void)0)
92096 +
92097 +static inline int smp_suspend(void)
92098 +{
92099 +       if (num_online_cpus() > 1) {
92100 +               printk(KERN_WARNING "Can't suspend SMP guests "
92101 +                      "without CONFIG_HOTPLUG_CPU\n");
92102 +               return -EOPNOTSUPP;
92103 +       }
92104 +       return 0;
92105 +}
92106 +
92107 +static inline void smp_resume(void)
92108 +{
92109 +}
92110 +
92111 +#endif /* !defined(CONFIG_HOTPLUG_CPU) */
92112 +
92113 +#endif /* __XEN_CPU_HOTPLUG_H__ */
92114 diff -ruNp linux-2.6.19/include/xen/driver_util.h linux-2.6.19-xen-3.0.4/include/xen/driver_util.h
92115 --- linux-2.6.19/include/xen/driver_util.h      1970-01-01 00:00:00.000000000 +0000
92116 +++ linux-2.6.19-xen-3.0.4/include/xen/driver_util.h    2007-02-02 19:11:00.000000000 +0000
92117 @@ -0,0 +1,15 @@
92118 +
92119 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
92120 +#define __ASM_XEN_DRIVER_UTIL_H__
92121 +
92122 +#include <linux/vmalloc.h>
92123 +
92124 +/* Allocate/destroy a 'vmalloc' VM area. */
92125 +extern struct vm_struct *alloc_vm_area(unsigned long size);
92126 +extern void free_vm_area(struct vm_struct *area);
92127 +
92128 +/* Lock an area so that PTEs are accessible in the current address space. */
92129 +extern void lock_vm_area(struct vm_struct *area);
92130 +extern void unlock_vm_area(struct vm_struct *area);
92131 +
92132 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
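A minimal usage sketch for the vmalloc-area helpers; the error handling and comments are illustrative.

/* Sketch: reserve a page-sized kernel virtual area, then release it. */
static int example_vm_area(void)
{
        struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

        if (area == NULL)
                return -ENOMEM;
        /* ... map a granted or shared frame at area->addr ... */
        free_vm_area(area);
        return 0;
}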
92133 diff -ruNp linux-2.6.19/include/xen/evtchn.h linux-2.6.19-xen-3.0.4/include/xen/evtchn.h
92134 --- linux-2.6.19/include/xen/evtchn.h   1970-01-01 00:00:00.000000000 +0000
92135 +++ linux-2.6.19-xen-3.0.4/include/xen/evtchn.h 2007-02-02 19:11:00.000000000 +0000
92136 @@ -0,0 +1,113 @@
92137 +/******************************************************************************
92138 + * evtchn.h
92139 + * 
92140 + * Communication via Xen event channels.
92141 + * Also definitions for the device that demuxes notifications to userspace.
92142 + * 
92143 + * Copyright (c) 2004-2005, K A Fraser
92144 + * 
92145 + * This program is free software; you can redistribute it and/or
92146 + * modify it under the terms of the GNU General Public License version 2
92147 + * as published by the Free Software Foundation; or, when distributed
92148 + * separately from the Linux kernel or incorporated into other
92149 + * software packages, subject to the following license:
92150 + * 
92151 + * Permission is hereby granted, free of charge, to any person obtaining a copy
92152 + * of this source file (the "Software"), to deal in the Software without
92153 + * restriction, including without limitation the rights to use, copy, modify,
92154 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
92155 + * and to permit persons to whom the Software is furnished to do so, subject to
92156 + * the following conditions:
92157 + * 
92158 + * The above copyright notice and this permission notice shall be included in
92159 + * all copies or substantial portions of the Software.
92160 + * 
92161 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92162 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92163 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92164 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
92165 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92166 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
92167 + * IN THE SOFTWARE.
92168 + */
92169 +
92170 +#ifndef __ASM_EVTCHN_H__
92171 +#define __ASM_EVTCHN_H__
92172 +
92173 +#include <linux/interrupt.h>
92174 +#include <asm/hypervisor.h>
92175 +#include <asm/ptrace.h>
92176 +#include <asm/synch_bitops.h>
92177 +#include <xen/interface/event_channel.h>
92178 +#include <linux/smp.h>
92179 +
92180 +/*
92181 + * LOW-LEVEL DEFINITIONS
92182 + */
92183 +
92184 +/*
92185 + * Dynamically bind an event source to an IRQ-like callback handler.
92186 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
92187 + * The IRQ argument passed to the callback handler is the same as returned
92188 + * from the bind call. It may not correspond to a Linux IRQ number.
92189 + * Returns IRQ or negative errno.
92190 + * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
92191 + */
92192 +extern int bind_evtchn_to_irqhandler(
92193 +       unsigned int evtchn,
92194 +       irq_handler_t handler,
92195 +       unsigned long irqflags,
92196 +       const char *devname,
92197 +       void *dev_id);
92198 +extern int bind_virq_to_irqhandler(
92199 +       unsigned int virq,
92200 +       unsigned int cpu,
92201 +       irq_handler_t handler,
92202 +       unsigned long irqflags,
92203 +       const char *devname,
92204 +       void *dev_id);
92205 +extern int bind_ipi_to_irqhandler(
92206 +       unsigned int ipi,
92207 +       unsigned int cpu,
92208 +       irq_handler_t handler,
92209 +       unsigned long irqflags,
92210 +       const char *devname,
92211 +       void *dev_id);
92212 +
92213 +/*
92214 + * Common unbind function for all event sources. Takes IRQ to unbind from.
92215 + * Automatically closes the underlying event channel (even for bindings
92216 + * made with bind_evtchn_to_irqhandler()).
92217 + */
92218 +extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
92219 +
92220 +extern void irq_resume(void);
92221 +
92222 +/* Entry point for notifications into Linux subsystems. */
92223 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
92224 +
92225 +/* Entry point for notifications into the userland character device. */
92226 +extern void evtchn_device_upcall(int port);
92227 +
92228 +extern void mask_evtchn(int port);
92229 +extern void unmask_evtchn(int port);
92230 +
92231 +static inline void clear_evtchn(int port)
92232 +{
92233 +       shared_info_t *s = HYPERVISOR_shared_info;
92234 +       synch_clear_bit(port, &s->evtchn_pending[0]);
92235 +}
92236 +
92237 +static inline void notify_remote_via_evtchn(int port)
92238 +{
92239 +       struct evtchn_send send = { .port = port };
92240 +       (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
92241 +}
92242 +
92243 +/*
92244 + * Unlike notify_remote_via_evtchn(), this is safe to use across
92245 + * save/restore. Notifications on a broken connection are silently dropped.
92246 + */
92247 +extern void notify_remote_via_irq(int irq);
92248 +
92249 +#endif /* __ASM_EVTCHN_H__ */
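A hedged usage sketch for the dynamic binding API described above, assuming the 2.6.19 two-argument irq_handler_t signature; the handler, device name and bind site are illustrative.

/* Illustrative handler bound to a Xen event channel. */
static irqreturn_t example_interrupt(int irq, void *dev_id)
{
        /* acknowledge the event / queue work for this source */
        return IRQ_HANDLED;
}

static int example_bind(unsigned int evtchn, void *dev)
{
        int irq = bind_evtchn_to_irqhandler(evtchn, example_interrupt,
                                            0, "example", dev);
        if (irq < 0)
                return irq;
        /* ... later: this also closes the underlying event channel ... */
        unbind_from_irqhandler(irq, dev);
        return 0;
}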
92250 diff -ruNp linux-2.6.19/include/xen/features.h linux-2.6.19-xen-3.0.4/include/xen/features.h
92251 --- linux-2.6.19/include/xen/features.h 1970-01-01 00:00:00.000000000 +0000
92252 +++ linux-2.6.19-xen-3.0.4/include/xen/features.h       2007-02-02 19:11:00.000000000 +0000
92253 @@ -0,0 +1,20 @@
92254 +/******************************************************************************
92255 + * features.h
92256 + *
92257 + * Query the features reported by Xen.
92258 + *
92259 + * Copyright (c) 2006, Ian Campbell
92260 + */
92261 +
92262 +#ifndef __ASM_XEN_FEATURES_H__
92263 +#define __ASM_XEN_FEATURES_H__
92264 +
92265 +#include <xen/interface/version.h>
92266 +
92267 +extern void setup_xen_features(void);
92268 +
92269 +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
92270 +
92271 +#define xen_feature(flag)      (xen_features[flag])
92272 +
92273 +#endif /* __ASM_XEN_FEATURES_H__ */
92274 diff -ruNp linux-2.6.19/include/xen/foreign_page.h linux-2.6.19-xen-3.0.4/include/xen/foreign_page.h
92275 --- linux-2.6.19/include/xen/foreign_page.h     1970-01-01 00:00:00.000000000 +0000
92276 +++ linux-2.6.19-xen-3.0.4/include/xen/foreign_page.h   2007-02-02 19:11:00.000000000 +0000
92277 @@ -0,0 +1,30 @@
92278 +/******************************************************************************
92279 + * foreign_page.h
92280 + * 
92281 + * Provide a "foreign" page type, that is owned by a foreign allocator and 
92282 + * not the normal buddy allocator in page_alloc.c
92283 + * 
92284 + * Copyright (c) 2004, K A Fraser
92285 + */
92286 +
92287 +#ifndef __ASM_XEN_FOREIGN_PAGE_H__
92288 +#define __ASM_XEN_FOREIGN_PAGE_H__
92289 +
92290 +#define PG_foreign             PG_arch_1
92291 +
92292 +#define PageForeign(page)      test_bit(PG_foreign, &(page)->flags)
92293 +
92294 +#define SetPageForeign(page, dtor) do {                \
92295 +       set_bit(PG_foreign, &(page)->flags);    \
92296 +       (page)->mapping = (void *)dtor;         \
92297 +} while (0)
92298 +
92299 +#define ClearPageForeign(page) do {            \
92300 +       clear_bit(PG_foreign, &(page)->flags);  \
92301 +       (page)->mapping = NULL;                 \
92302 +} while (0)
92303 +
92304 +#define PageForeignDestructor(page)    \
92305 +       ( (void (*) (struct page *)) (page)->mapping )
92306 +
92307 +#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
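A hedged sketch of the foreign-page lifecycle these macros implement; the destructor and helper names are illustrative.

/* Illustrative destructor invoked when a foreign page is finally freed. */
static void example_foreign_dtor(struct page *page)
{
        ClearPageForeign(page);
        /* ... hand the page back to its real owner ... */
}

static void example_mark_foreign(struct page *page)
{
        SetPageForeign(page, example_foreign_dtor);
        BUG_ON(!PageForeign(page));
        /* the free path can later invoke:
         *   PageForeignDestructor(page)(page);
         */
}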
92308 diff -ruNp linux-2.6.19/include/xen/gnttab.h linux-2.6.19-xen-3.0.4/include/xen/gnttab.h
92309 --- linux-2.6.19/include/xen/gnttab.h   1970-01-01 00:00:00.000000000 +0000
92310 +++ linux-2.6.19-xen-3.0.4/include/xen/gnttab.h 2007-02-02 19:11:00.000000000 +0000
92311 @@ -0,0 +1,151 @@
92312 +/******************************************************************************
92313 + * gnttab.h
92314 + * 
92315 + * Two sets of functionality:
92316 + * 1. Granting foreign access to our memory reservation.
92317 + * 2. Accessing others' memory reservations via grant references.
92318 + * (i.e., mechanisms for both sender and recipient of grant references)
92319 + * 
92320 + * Copyright (c) 2004-2005, K A Fraser
92321 + * Copyright (c) 2005, Christopher Clark
92322 + * 
92323 + * This program is free software; you can redistribute it and/or
92324 + * modify it under the terms of the GNU General Public License version 2
92325 + * as published by the Free Software Foundation; or, when distributed
92326 + * separately from the Linux kernel or incorporated into other
92327 + * software packages, subject to the following license:
92328 + * 
92329 + * Permission is hereby granted, free of charge, to any person obtaining a copy
92330 + * of this source file (the "Software"), to deal in the Software without
92331 + * restriction, including without limitation the rights to use, copy, modify,
92332 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
92333 + * and to permit persons to whom the Software is furnished to do so, subject to
92334 + * the following conditions:
92335 + * 
92336 + * The above copyright notice and this permission notice shall be included in
92337 + * all copies or substantial portions of the Software.
92338 + * 
92339 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92340 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92341 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92342 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
92343 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92344 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
92345 + * IN THE SOFTWARE.
92346 + */
92347 +
92348 +#ifndef __ASM_GNTTAB_H__
92349 +#define __ASM_GNTTAB_H__
92350 +
92351 +#include <asm/hypervisor.h>
92352 +#include <asm/maddr.h> /* maddr_t */
92353 +#include <xen/interface/grant_table.h>
92354 +#include <xen/features.h>
92355 +
92356 +/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
92357 +#ifdef __ia64__
92358 +#define NR_GRANT_FRAMES 1
92359 +#else
92360 +#define NR_GRANT_FRAMES 4
92361 +#endif
92362 +
92363 +struct gnttab_free_callback {
92364 +       struct gnttab_free_callback *next;
92365 +       void (*fn)(void *);
92366 +       void *arg;
92367 +       u16 count;
92368 +};
92369 +
92370 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
92371 +                               int readonly);
92372 +
92373 +/*
92374 + * End access through the given grant reference, iff the grant entry is no
92375 + * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
92376 + * use.
92377 + */
92378 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
92379 +
92380 +/*
92381 + * Eventually end access through the given grant reference, and once that
92382 + * access has been ended, free the given page too.  Access will be ended
92383 + * immediately iff the grant entry is not in use, otherwise it will happen
92384 + * some time later.  page may be 0, in which case no freeing will occur.
92385 + */
92386 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
92387 +                              unsigned long page);
92388 +
92389 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
92390 +
92391 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
92392 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
92393 +
92394 +int gnttab_query_foreign_access(grant_ref_t ref);
92395 +
92396 +/*
92397 + * operations on reserved batches of grant references
92398 + */
92399 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
92400 +
92401 +void gnttab_free_grant_reference(grant_ref_t ref);
92402 +
92403 +void gnttab_free_grant_references(grant_ref_t head);
92404 +
92405 +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
92406 +
92407 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
92408 +
92409 +void gnttab_release_grant_reference(grant_ref_t *private_head,
92410 +                                   grant_ref_t release);
92411 +
92412 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
92413 +                                 void (*fn)(void *), void *arg, u16 count);
92414 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
92415 +
92416 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
92417 +                                    unsigned long frame, int readonly);
92418 +
92419 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
92420 +                                      unsigned long pfn);
92421 +
92422 +#ifdef __ia64__
92423 +#define gnttab_map_vaddr(map) __va(map.dev_bus_addr)
92424 +#else
92425 +#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
92426 +#endif
92427 +
92428 +int gnttab_suspend(void);
92429 +int gnttab_resume(void);
92430 +
92431 +static inline void
92432 +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
92433 +                 uint32_t flags, grant_ref_t ref, domid_t domid)
92434 +{
92435 +       if (flags & GNTMAP_contains_pte)
92436 +               map->host_addr = addr;
92437 +       else if (xen_feature(XENFEAT_auto_translated_physmap))
92438 +               map->host_addr = __pa(addr);
92439 +       else
92440 +               map->host_addr = addr;
92441 +
92442 +       map->flags = flags;
92443 +       map->ref = ref;
92444 +       map->dom = domid;
92445 +}
92446 +
92447 +static inline void
92448 +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
92449 +                   uint32_t flags, grant_handle_t handle)
92450 +{
92451 +       if (flags & GNTMAP_contains_pte)
92452 +               unmap->host_addr = addr;
92453 +       else if (xen_feature(XENFEAT_auto_translated_physmap))
92454 +               unmap->host_addr = __pa(addr);
92455 +       else
92456 +               unmap->host_addr = addr;
92457 +
92458 +       unmap->handle = handle;
92459 +       unmap->dev_bus_addr = 0;
92460 +}
92461 +
92462 +#endif /* __ASM_GNTTAB_H__ */
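A hedged sketch of the grant/revoke cycle for foreign access; the peer domain, frame and helper name are illustrative.

/* Sketch: grant a frame read-only to a peer domain, then revoke it. */
static int example_grant_cycle(domid_t peer, unsigned long frame)
{
        int ref = gnttab_grant_foreign_access(peer, frame, 1 /* readonly */);

        if (ref < 0)
                return ref;
        /* ... the peer maps the frame via the grant reference ... */
        gnttab_end_foreign_access(ref, 1 /* readonly */, 0 /* no page to free */);
        return 0;
}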
92463 diff -ruNp linux-2.6.19/include/xen/hvm.h linux-2.6.19-xen-3.0.4/include/xen/hvm.h
92464 --- linux-2.6.19/include/xen/hvm.h      1970-01-01 00:00:00.000000000 +0000
92465 +++ linux-2.6.19-xen-3.0.4/include/xen/hvm.h    2007-02-02 19:11:00.000000000 +0000
92466 @@ -0,0 +1,24 @@
92467 +/* Simple wrappers around HVM functions */
92468 +#ifndef XEN_HVM_H__
92469 +#define XEN_HVM_H__
92470 +
92471 +#include <xen/interface/hvm/params.h>
92472 +#include <asm/hypercall.h>
92473 +
92474 +static inline unsigned long hvm_get_parameter(int idx)
92475 +{
92476 +       struct xen_hvm_param xhv;
92477 +       int r;
92478 +
92479 +       xhv.domid = DOMID_SELF;
92480 +       xhv.index = idx;
92481 +       r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
92482 +       if (r < 0) {
92483 +               printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
92484 +                      idx, r);
92485 +               return 0;
92486 +       }
92487 +       return xhv.value;
92488 +}
92489 +
92490 +#endif /* XEN_HVM_H__ */
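A small usage sketch, assuming HVM_PARAM_STORE_EVTCHN from xen/interface/hvm/params.h; the wrapper function is illustrative.

/* Sketch: fetch the xenstore event-channel port for this HVM guest. */
static unsigned long example_store_evtchn(void)
{
        return hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
}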
92491 diff -ruNp linux-2.6.19/include/xen/hypervisor_sysfs.h linux-2.6.19-xen-3.0.4/include/xen/hypervisor_sysfs.h
92492 --- linux-2.6.19/include/xen/hypervisor_sysfs.h 1970-01-01 00:00:00.000000000 +0000
92493 +++ linux-2.6.19-xen-3.0.4/include/xen/hypervisor_sysfs.h       2007-02-02 19:11:00.000000000 +0000
92494 @@ -0,0 +1,32 @@
92495 +/*
92496 + *  copyright (c) 2006 IBM Corporation
92497 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
92498 + *
92499 + *  This program is free software; you can redistribute it and/or modify
92500 + *  it under the terms of the GNU General Public License version 2 as
92501 + *  published by the Free Software Foundation.
92502 + */
92503 +
92504 +#ifndef _HYP_SYSFS_H_
92505 +#define _HYP_SYSFS_H_
92506 +
92507 +#include <linux/kobject.h>
92508 +#include <linux/sysfs.h>
92509 +
92510 +#define HYPERVISOR_ATTR_RO(_name) \
92511 +static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
92512 +
92513 +#define HYPERVISOR_ATTR_RW(_name) \
92514 +static struct hyp_sysfs_attr _name##_attr = \
92515 +       __ATTR(_name, 0644, _name##_show, _name##_store)
92516 +
92517 +extern struct subsystem hypervisor_subsys;
92518 +
92519 +struct hyp_sysfs_attr {
92520 +       struct attribute attr;
92521 +       ssize_t (*show)(struct hyp_sysfs_attr *, char *);
92522 +       ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
92523 +       void *hyp_attr_data;
92524 +};
92525 +
92526 +#endif /* _HYP_SYSFS_H_ */
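A hedged sketch of declaring a read-only hypervisor attribute with these macros; the attribute name, value and registration comment are illustrative.

/* Illustrative read-only attribute "example" under the hypervisor subsystem. */
static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
{
        return sprintf(buffer, "xen\n");
}
HYPERVISOR_ATTR_RO(example);

/* Registration (error handling omitted) would then be roughly:
 *   sysfs_create_file(&hypervisor_subsys.kset.kobj, &example_attr.attr);
 */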
92527 diff -ruNp linux-2.6.19/include/xen/interface/COPYING linux-2.6.19-xen-3.0.4/include/xen/interface/COPYING
92528 --- linux-2.6.19/include/xen/interface/COPYING  1970-01-01 00:00:00.000000000 +0000
92529 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/COPYING        2007-02-02 19:11:00.000000000 +0000
92530 @@ -0,0 +1,38 @@
92531 +XEN NOTICE
92532 +==========
92533 +
92534 +This copyright applies to all files within this subdirectory and its
92535 +subdirectories:
92536 +  include/public/*.h
92537 +  include/public/hvm/*.h
92538 +  include/public/io/*.h
92539 +
92540 +The intention is that these files can be freely copied into the source
92541 +tree of an operating system when porting that OS to run on Xen. Doing
92542 +so does *not* cause the OS to become subject to the terms of the GPL.
92543 +
92544 +All other files in the Xen source distribution are covered by version
92545 +2 of the GNU General Public License except where explicitly stated
92546 +otherwise within individual source files.
92547 +
92548 + -- Keir Fraser (on behalf of the Xen team)
92549 +
92550 +=====================================================================
92551 +
92552 +Permission is hereby granted, free of charge, to any person obtaining a copy
92553 +of this software and associated documentation files (the "Software"), to
92554 +deal in the Software without restriction, including without limitation the
92555 +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
92556 +sell copies of the Software, and to permit persons to whom the Software is
92557 +furnished to do so, subject to the following conditions:
92558 +
92559 +The above copyright notice and this permission notice shall be included in
92560 +all copies or substantial portions of the Software.
92561 +
92562 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
92563 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
92564 +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
92565 +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
92566 +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
92567 +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
92568 +DEALINGS IN THE SOFTWARE.
92569 diff -ruNp linux-2.6.19/include/xen/interface/acm.h linux-2.6.19-xen-3.0.4/include/xen/interface/acm.h
92570 --- linux-2.6.19/include/xen/interface/acm.h    1970-01-01 00:00:00.000000000 +0000
92571 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/acm.h  2007-02-02 19:11:00.000000000 +0000
92572 @@ -0,0 +1,205 @@
92573 +/*
92574 + * acm.h: Xen access control module interface definitions
92575 + *
92576 + * Permission is hereby granted, free of charge, to any person obtaining a copy
92577 + * of this software and associated documentation files (the "Software"), to
92578 + * deal in the Software without restriction, including without limitation the
92579 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
92580 + * sell copies of the Software, and to permit persons to whom the Software is
92581 + * furnished to do so, subject to the following conditions:
92582 + *
92583 + * The above copyright notice and this permission notice shall be included in
92584 + * all copies or substantial portions of the Software.
92585 + *
92586 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92587 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92588 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92589 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
92590 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92591 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
92592 + * DEALINGS IN THE SOFTWARE.
92593 + *
92594 + * Reiner Sailer <sailer@watson.ibm.com>
92595 + * Copyright (c) 2005, International Business Machines Corporation.
92596 + */
92597 +
92598 +#ifndef _XEN_PUBLIC_ACM_H
92599 +#define _XEN_PUBLIC_ACM_H
92600 +
92601 +#include "xen.h"
92602 +
92603 +/* If ACM_DEBUG is defined, all hooks should
92604 + * print a short trace message (comment it out
92605 + * when not in testing mode)
92606 + */
92607 +/* #define ACM_DEBUG */
92608 +
92609 +#ifdef ACM_DEBUG
92610 +#  define printkd(fmt, args...) printk(fmt,## args)
92611 +#else
92612 +#  define printkd(fmt, args...)
92613 +#endif
92614 +
92615 +/* default ssid reference value if not supplied */
92616 +#define ACM_DEFAULT_SSID  0x0
92617 +#define ACM_DEFAULT_LOCAL_SSID  0x0
92618 +
92619 +/* Internal ACM ERROR types */
92620 +#define ACM_OK     0
92621 +#define ACM_UNDEF   -1
92622 +#define ACM_INIT_SSID_ERROR  -2
92623 +#define ACM_INIT_SOID_ERROR  -3
92624 +#define ACM_ERROR          -4
92625 +
92626 +/* External ACCESS DECISIONS */
92627 +#define ACM_ACCESS_PERMITTED        0
92628 +#define ACM_ACCESS_DENIED           -111
92629 +#define ACM_NULL_POINTER_ERROR      -200
92630 +
92631 +/* primary policy in lower 4 bits */
92632 +#define ACM_NULL_POLICY 0
92633 +#define ACM_CHINESE_WALL_POLICY 1
92634 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
92635 +#define ACM_POLICY_UNDEFINED 15
92636 +
92637 +/* combinations have secondary policy component in higher 4bit */
92638 +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
92639 +    ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
92640 +
92641 +/* policy: */
92642 +#define ACM_POLICY_NAME(X) \
92643 + ((X) == (ACM_NULL_POLICY)) ? "NULL" :                        \
92644 +    ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" :        \
92645 +    ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
92646 +    ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
92647 +     "UNDEFINED"
92648 +
92649 +/* the following policy versions must be increased
92650 + * whenever the interpretation of the related
92651 + * policy's data structure changes
92652 + */
92653 +#define ACM_POLICY_VERSION 2
92654 +#define ACM_CHWALL_VERSION 1
92655 +#define ACM_STE_VERSION  1
92656 +
92657 +/* defines a ssid reference used by xen */
92658 +typedef uint32_t ssidref_t;
92659 +
92660 +/* hooks that are known to domains */
92661 +#define ACMHOOK_none    0
92662 +#define ACMHOOK_sharing 1
92663 +
92664 +/* -------security policy relevant type definitions-------- */
92665 +
92666 +/* type identifier; compares to "equal" or "not equal" */
92667 +typedef uint16_t domaintype_t;
92668 +
92669 +/* CHINESE WALL POLICY DATA STRUCTURES
92670 + *
92671 + * current accumulated conflict type set:
92672 + * When a domain is started and has a type that is in
92673 + * a conflict set, the conflicting types are incremented in
92674 + * the aggregate set. When a domain is destroyed, the 
92675 + * conflicting types to its type are decremented.
92676 + * If a domain has multiple types, this procedure works over
92677 + * all those types.
92678 + *
92679 + * conflict_aggregate_set[i] holds the number of
92680 + *   running domains that have a conflict with type i.
92681 + *
92682 + * running_types[i] holds the number of running domains
92683 + *        that include type i in their ssidref-referenced type set
92684 + *
92685 + * conflict_sets[i][j] is "0" if type j has no conflict
92686 + *    with type i and is "1" otherwise.
92687 + */
92688 +/* high-16 = version, low-16 = check magic */
92689 +#define ACM_MAGIC  0x0001debc
92690 +
92691 +/* each offset is given in bytes from the start of the
92692 + * struct it belongs to */
92693 +
92694 +/* each buffer consists of all policy information for
92695 + * the respective policy given in the policy code
92696 + *
92697 + * acm_policy_buffer, acm_chwall_policy_buffer,
92698 + * and acm_ste_policy_buffer need to stay 32-bit aligned
92699 + * because we create binary policies also with external
92700 + * tools that assume packed representations (e.g. the java tool)
92701 + */
92702 +struct acm_policy_buffer {
92703 +    uint32_t policy_version; /* ACM_POLICY_VERSION */
92704 +    uint32_t magic;
92705 +    uint32_t len;
92706 +    uint32_t policy_reference_offset;
92707 +    uint32_t primary_policy_code;
92708 +    uint32_t primary_buffer_offset;
92709 +    uint32_t secondary_policy_code;
92710 +    uint32_t secondary_buffer_offset;
92711 +};
92712 +
92713 +struct acm_policy_reference_buffer {
92714 +    uint32_t len;
92715 +};
92716 +
92717 +struct acm_chwall_policy_buffer {
92718 +    uint32_t policy_version; /* ACM_CHWALL_VERSION */
92719 +    uint32_t policy_code;
92720 +    uint32_t chwall_max_types;
92721 +    uint32_t chwall_max_ssidrefs;
92722 +    uint32_t chwall_max_conflictsets;
92723 +    uint32_t chwall_ssid_offset;
92724 +    uint32_t chwall_conflict_sets_offset;
92725 +    uint32_t chwall_running_types_offset;
92726 +    uint32_t chwall_conflict_aggregate_offset;
92727 +};
92728 +
92729 +struct acm_ste_policy_buffer {
92730 +    uint32_t policy_version; /* ACM_STE_VERSION */
92731 +    uint32_t policy_code;
92732 +    uint32_t ste_max_types;
92733 +    uint32_t ste_max_ssidrefs;
92734 +    uint32_t ste_ssid_offset;
92735 +};
92736 +
92737 +struct acm_stats_buffer {
92738 +    uint32_t magic;
92739 +    uint32_t len;
92740 +    uint32_t primary_policy_code;
92741 +    uint32_t primary_stats_offset;
92742 +    uint32_t secondary_policy_code;
92743 +    uint32_t secondary_stats_offset;
92744 +};
92745 +
92746 +struct acm_ste_stats_buffer {
92747 +    uint32_t ec_eval_count;
92748 +    uint32_t gt_eval_count;
92749 +    uint32_t ec_denied_count;
92750 +    uint32_t gt_denied_count;
92751 +    uint32_t ec_cachehit_count;
92752 +    uint32_t gt_cachehit_count;
92753 +};
92754 +
92755 +struct acm_ssid_buffer {
92756 +    uint32_t len;
92757 +    ssidref_t ssidref;
92758 +    uint32_t policy_reference_offset;
92759 +    uint32_t primary_policy_code;
92760 +    uint32_t primary_max_types;
92761 +    uint32_t primary_types_offset;
92762 +    uint32_t secondary_policy_code;
92763 +    uint32_t secondary_max_types;
92764 +    uint32_t secondary_types_offset;
92765 +};
92766 +
92767 +#endif
92768 +
92769 +/*
92770 + * Local variables:
92771 + * mode: C
92772 + * c-set-style: "BSD"
92773 + * c-basic-offset: 4
92774 + * tab-width: 4
92775 + * indent-tabs-mode: nil
92776 + * End:
92777 + */
92778 diff -ruNp linux-2.6.19/include/xen/interface/acm_ops.h linux-2.6.19-xen-3.0.4/include/xen/interface/acm_ops.h
92779 --- linux-2.6.19/include/xen/interface/acm_ops.h        1970-01-01 00:00:00.000000000 +0000
92780 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/acm_ops.h      2007-02-02 19:11:00.000000000 +0000
92781 @@ -0,0 +1,120 @@
92782 +/*
92783 + * acm_ops.h: Xen access control module hypervisor commands
92784 + *
92785 + * Permission is hereby granted, free of charge, to any person obtaining a copy
92786 + * of this software and associated documentation files (the "Software"), to
92787 + * deal in the Software without restriction, including without limitation the
92788 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
92789 + * sell copies of the Software, and to permit persons to whom the Software is
92790 + * furnished to do so, subject to the following conditions:
92791 + *
92792 + * The above copyright notice and this permission notice shall be included in
92793 + * all copies or substantial portions of the Software.
92794 + *
92795 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92796 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92797 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92798 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
92799 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92800 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
92801 + * DEALINGS IN THE SOFTWARE.
92802 + *
92803 + * Reiner Sailer <sailer@watson.ibm.com>
92804 + * Copyright (c) 2005,2006 International Business Machines Corporation.
92805 + */
92806 +
92807 +#ifndef __XEN_PUBLIC_ACM_OPS_H__
92808 +#define __XEN_PUBLIC_ACM_OPS_H__
92809 +
92810 +#include "xen.h"
92811 +#include "acm.h"
92812 +
92813 +/*
92814 + * Make sure you increment the interface version whenever you modify this file!
92815 + * This makes sure that old versions of acm tools will stop working in a
92816 + * well-defined way (rather than crashing the machine, for instance).
92817 + */
92818 +#define ACM_INTERFACE_VERSION   0xAAAA0008
92819 +
92820 +/************************************************************************/
92821 +
92822 +/*
92823 + * Prototype for this hypercall is:
92824 + *  int acm_op(int cmd, void *args)
92825 + * @cmd  == ACMOP_??? (access control module operation).
92826 + * @args == Operation-specific extra arguments (NULL if none).
92827 + */
92828 +
92829 +
92830 +#define ACMOP_setpolicy         1
92831 +struct acm_setpolicy {
92832 +    /* IN */
92833 +    uint32_t interface_version;
92834 +    XEN_GUEST_HANDLE(void) pushcache;
92835 +    uint32_t pushcache_size;
92836 +};
92837 +
92838 +
92839 +#define ACMOP_getpolicy         2
92840 +struct acm_getpolicy {
92841 +    /* IN */
92842 +    uint32_t interface_version;
92843 +    XEN_GUEST_HANDLE(void) pullcache;
92844 +    uint32_t pullcache_size;
92845 +};
92846 +
92847 +
92848 +#define ACMOP_dumpstats         3
92849 +struct acm_dumpstats {
92850 +    /* IN */
92851 +    uint32_t interface_version;
92852 +    XEN_GUEST_HANDLE(void) pullcache;
92853 +    uint32_t pullcache_size;
92854 +};
92855 +
92856 +
92857 +#define ACMOP_getssid           4
92858 +#define ACM_GETBY_ssidref  1
92859 +#define ACM_GETBY_domainid 2
92860 +struct acm_getssid {
92861 +    /* IN */
92862 +    uint32_t interface_version;
92863 +    uint32_t get_ssid_by; /* ACM_GETBY_* */
92864 +    union {
92865 +        domaintype_t domainid;
92866 +        ssidref_t    ssidref;
92867 +    } id;
92868 +    XEN_GUEST_HANDLE(void) ssidbuf;
92869 +    uint32_t ssidbuf_size;
92870 +};
92871 +
92872 +#define ACMOP_getdecision      5
92873 +struct acm_getdecision {
92874 +    /* IN */
92875 +    uint32_t interface_version;
92876 +    uint32_t get_decision_by1; /* ACM_GETBY_* */
92877 +    uint32_t get_decision_by2; /* ACM_GETBY_* */
92878 +    union {
92879 +        domaintype_t domainid;
92880 +        ssidref_t    ssidref;
92881 +    } id1;
92882 +    union {
92883 +        domaintype_t domainid;
92884 +        ssidref_t    ssidref;
92885 +    } id2;
92886 +    uint32_t hook;
92887 +    /* OUT */
92888 +    uint32_t acm_decision;
92889 +};
92890 +
92891 +#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
92892 +
92893 +/*
92894 + * Local variables:
92895 + * mode: C
92896 + * c-set-style: "BSD"
92897 + * c-basic-offset: 4
92898 + * tab-width: 4
92899 + * indent-tabs-mode: nil
92900 + * End:
92901 + */
92902 diff -ruNp linux-2.6.19/include/xen/interface/arch-ia64.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-ia64.h
92903 --- linux-2.6.19/include/xen/interface/arch-ia64.h      1970-01-01 00:00:00.000000000 +0000
92904 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-ia64.h    2007-02-02 19:11:00.000000000 +0000
92905 @@ -0,0 +1,500 @@
92906 +/******************************************************************************
92907 + * arch-ia64/hypervisor-if.h
92908 + * 
92909 + * Guest OS interface to IA64 Xen.
92910 + *
92911 + * Permission is hereby granted, free of charge, to any person obtaining a copy
92912 + * of this software and associated documentation files (the "Software"), to
92913 + * deal in the Software without restriction, including without limitation the
92914 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
92915 + * sell copies of the Software, and to permit persons to whom the Software is
92916 + * furnished to do so, subject to the following conditions:
92917 + *
92918 + * The above copyright notice and this permission notice shall be included in
92919 + * all copies or substantial portions of the Software.
92920 + *
92921 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
92922 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
92923 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
92924 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
92925 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92926 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
92927 + * DEALINGS IN THE SOFTWARE.
92928 + *
92929 + */
92930 +
92931 +#ifndef __HYPERVISOR_IF_IA64_H__
92932 +#define __HYPERVISOR_IF_IA64_H__
92933 +
92934 +/* Structural guest handles introduced in 0x00030201. */
92935 +#if __XEN_INTERFACE_VERSION__ >= 0x00030201
92936 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
92937 +    typedef struct { type *p; } __guest_handle_ ## name
92938 +#else
92939 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
92940 +    typedef type * __guest_handle_ ## name
92941 +#endif
92942 +
92943 +#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
92944 +#define XEN_GUEST_HANDLE(name)          __guest_handle_ ## name
92945 +#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
92946 +#ifdef __XEN_TOOLS__
92947 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
92948 +#endif
92949 +
92950 +#ifndef __ASSEMBLY__
92951 +/* Guest handles for primitive C types. */
92952 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
92953 +__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
92954 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
92955 +__DEFINE_XEN_GUEST_HANDLE(u64,   unsigned long);
92956 +DEFINE_XEN_GUEST_HANDLE(char);
92957 +DEFINE_XEN_GUEST_HANDLE(int);
92958 +DEFINE_XEN_GUEST_HANDLE(long);
92959 +DEFINE_XEN_GUEST_HANDLE(void);
92960 +
92961 +typedef unsigned long xen_pfn_t;
92962 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
92963 +#endif
92964 +
92965 +/* Arch specific VIRQs definition */
92966 +#define VIRQ_ITC        VIRQ_ARCH_0 /* V. Virtual itc timer */
92967 +#define VIRQ_MCA_CMC    VIRQ_ARCH_1 /* MCA cmc interrupt */
92968 +#define VIRQ_MCA_CPE    VIRQ_ARCH_2 /* MCA cpe interrupt */
92969 +
92970 +/* Maximum number of virtual CPUs in multi-processor guests. */
92971 +/* WARNING: before changing this, check that shared_info fits on a page */
92972 +#define MAX_VIRT_CPUS 64
92973 +
92974 +#ifndef __ASSEMBLY__
92975 +
92976 +typedef unsigned long xen_ulong_t;
92977 +
92978 +#define INVALID_MFN       (~0UL)
92979 +
92980 +#define MEM_G   (1UL << 30)
92981 +#define MEM_M   (1UL << 20)
92982 +
92983 +#define MMIO_START       (3 * MEM_G)
92984 +#define MMIO_SIZE        (512 * MEM_M)
92985 +
92986 +#define VGA_IO_START     0xA0000UL
92987 +#define VGA_IO_SIZE      0x20000
92988 +
92989 +#define LEGACY_IO_START  (MMIO_START + MMIO_SIZE)
92990 +#define LEGACY_IO_SIZE   (64*MEM_M)
92991 +
92992 +#define IO_PAGE_START (LEGACY_IO_START + LEGACY_IO_SIZE)
92993 +#define IO_PAGE_SIZE  PAGE_SIZE
92994 +
92995 +#define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE)
92996 +#define STORE_PAGE_SIZE         PAGE_SIZE
92997 +
92998 +#define BUFFER_IO_PAGE_START (STORE_PAGE_START+PAGE_SIZE)
92999 +#define BUFFER_IO_PAGE_SIZE PAGE_SIZE
93000 +
93001 +#define IO_SAPIC_START   0xfec00000UL
93002 +#define IO_SAPIC_SIZE    0x100000
93003 +
93004 +#define PIB_START 0xfee00000UL
93005 +#define PIB_SIZE 0x200000
93006 +
93007 +#define GFW_START        (4*MEM_G -16*MEM_M)
93008 +#define GFW_SIZE         (16*MEM_M)
93009 +
93010 +struct pt_fpreg {
93011 +    union {
93012 +        unsigned long bits[2];
93013 +        long double __dummy;    /* force 16-byte alignment */
93014 +    } u;
93015 +};
93016 +
93017 +struct cpu_user_regs {
93018 +    /* The following registers are saved by SAVE_MIN: */
93019 +    unsigned long b6;  /* scratch */
93020 +    unsigned long b7;  /* scratch */
93021 +
93022 +    unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
93023 +    unsigned long ar_ssd; /* reserved for future use (scratch) */
93024 +
93025 +    unsigned long r8;  /* scratch (return value register 0) */
93026 +    unsigned long r9;  /* scratch (return value register 1) */
93027 +    unsigned long r10; /* scratch (return value register 2) */
93028 +    unsigned long r11; /* scratch (return value register 3) */
93029 +
93030 +    unsigned long cr_ipsr; /* interrupted task's psr */
93031 +    unsigned long cr_iip;  /* interrupted task's instruction pointer */
93032 +    unsigned long cr_ifs;  /* interrupted task's function state */
93033 +
93034 +    unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
93035 +    unsigned long ar_pfs;  /* prev function state  */
93036 +    unsigned long ar_rsc;  /* RSE configuration */
93037 +    /* The following two are valid only if cr_ipsr.cpl > 0: */
93038 +    unsigned long ar_rnat;  /* RSE NaT */
93039 +    unsigned long ar_bspstore; /* RSE bspstore */
93040 +
93041 +    unsigned long pr;  /* 64 predicate registers (1 bit each) */
93042 +    unsigned long b0;  /* return pointer (bp) */
93043 +    unsigned long loadrs;  /* size of dirty partition << 16 */
93044 +
93045 +    unsigned long r1;  /* the gp pointer */
93046 +    unsigned long r12; /* interrupted task's memory stack pointer */
93047 +    unsigned long r13; /* thread pointer */
93048 +
93049 +    unsigned long ar_fpsr;  /* floating point status (preserved) */
93050 +    unsigned long r15;  /* scratch */
93051 +
93052 + /* The remaining registers are NOT saved for system calls.  */
93053 +
93054 +    unsigned long r14;  /* scratch */
93055 +    unsigned long r2;  /* scratch */
93056 +    unsigned long r3;  /* scratch */
93057 +    unsigned long r16;  /* scratch */
93058 +    unsigned long r17;  /* scratch */
93059 +    unsigned long r18;  /* scratch */
93060 +    unsigned long r19;  /* scratch */
93061 +    unsigned long r20;  /* scratch */
93062 +    unsigned long r21;  /* scratch */
93063 +    unsigned long r22;  /* scratch */
93064 +    unsigned long r23;  /* scratch */
93065 +    unsigned long r24;  /* scratch */
93066 +    unsigned long r25;  /* scratch */
93067 +    unsigned long r26;  /* scratch */
93068 +    unsigned long r27;  /* scratch */
93069 +    unsigned long r28;  /* scratch */
93070 +    unsigned long r29;  /* scratch */
93071 +    unsigned long r30;  /* scratch */
93072 +    unsigned long r31;  /* scratch */
93073 +    unsigned long ar_ccv;  /* compare/exchange value (scratch) */
93074 +
93075 +    /*
93076 +     * Floating point registers that the kernel considers scratch:
93077 +     */
93078 +    struct pt_fpreg f6;  /* scratch */
93079 +    struct pt_fpreg f7;  /* scratch */
93080 +    struct pt_fpreg f8;  /* scratch */
93081 +    struct pt_fpreg f9;  /* scratch */
93082 +    struct pt_fpreg f10;  /* scratch */
93083 +    struct pt_fpreg f11;  /* scratch */
93084 +    unsigned long r4;  /* preserved */
93085 +    unsigned long r5;  /* preserved */
93086 +    unsigned long r6;  /* preserved */
93087 +    unsigned long r7;  /* preserved */
93088 +    unsigned long eml_unat;    /* used for emulating instruction */
93089 +    unsigned long pad0;     /* alignment pad */
93090 +
93091 +};
93092 +typedef struct cpu_user_regs cpu_user_regs_t;
93093 +
93094 +union vac {
93095 +    unsigned long value;
93096 +    struct {
93097 +        int a_int:1;
93098 +        int a_from_int_cr:1;
93099 +        int a_to_int_cr:1;
93100 +        int a_from_psr:1;
93101 +        int a_from_cpuid:1;
93102 +        int a_cover:1;
93103 +        int a_bsw:1;
93104 +        long reserved:57;
93105 +    };
93106 +};
93107 +typedef union vac vac_t;
93108 +
93109 +union vdc {
93110 +    unsigned long value;
93111 +    struct {
93112 +        int d_vmsw:1;
93113 +        int d_extint:1;
93114 +        int d_ibr_dbr:1;
93115 +        int d_pmc:1;
93116 +        int d_to_pmd:1;
93117 +        int d_itm:1;
93118 +        long reserved:58;
93119 +    };
93120 +};
93121 +typedef union vdc vdc_t;
93122 +
93123 +struct mapped_regs {
93124 +    union vac   vac;
93125 +    union vdc   vdc;
93126 +    unsigned long  virt_env_vaddr;
93127 +    unsigned long  reserved1[29];
93128 +    unsigned long  vhpi;
93129 +    unsigned long  reserved2[95];
93130 +    union {
93131 +        unsigned long  vgr[16];
93132 +        unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
93133 +    };
93134 +    union {
93135 +        unsigned long  vbgr[16];
93136 +        unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
93137 +    };
93138 +    unsigned long  vnat;
93139 +    unsigned long  vbnat;
93140 +    unsigned long  vcpuid[5];
93141 +    unsigned long  reserved3[11];
93142 +    unsigned long  vpsr;
93143 +    unsigned long  vpr;
93144 +    unsigned long  reserved4[76];
93145 +    union {
93146 +        unsigned long  vcr[128];
93147 +        struct {
93148 +            unsigned long dcr;  // CR0
93149 +            unsigned long itm;
93150 +            unsigned long iva;
93151 +            unsigned long rsv1[5];
93152 +            unsigned long pta;  // CR8
93153 +            unsigned long rsv2[7];
93154 +            unsigned long ipsr;  // CR16
93155 +            unsigned long isr;
93156 +            unsigned long rsv3;
93157 +            unsigned long iip;
93158 +            unsigned long ifa;
93159 +            unsigned long itir;
93160 +            unsigned long iipa;
93161 +            unsigned long ifs;
93162 +            unsigned long iim;  // CR24
93163 +            unsigned long iha;
93164 +            unsigned long rsv4[38];
93165 +            unsigned long lid;  // CR64
93166 +            unsigned long ivr;
93167 +            unsigned long tpr;
93168 +            unsigned long eoi;
93169 +            unsigned long irr[4];
93170 +            unsigned long itv;  // CR72
93171 +            unsigned long pmv;
93172 +            unsigned long cmcv;
93173 +            unsigned long rsv5[5];
93174 +            unsigned long lrr0;  // CR80
93175 +            unsigned long lrr1;
93176 +            unsigned long rsv6[46];
93177 +        };
93178 +    };
93179 +    union {
93180 +        unsigned long  reserved5[128];
93181 +        struct {
93182 +            unsigned long precover_ifs;
93183 +            unsigned long unat;  // not sure if this is needed until NaT arch is done
93184 +            int interrupt_collection_enabled; // virtual psr.ic
93185 +            /* virtual interrupt deliverable flag is evtchn_upcall_mask in
93186 +             * shared info area now. interrupt_mask_addr is the address
93187 +             * of evtchn_upcall_mask for current vcpu
93188 +             */
93189 +            unsigned char *interrupt_mask_addr;
93190 +            int pending_interruption;
93191 +            int incomplete_regframe; // see SDM vol2 6.8
93192 +            unsigned char vpsr_pp;
93193 +            unsigned char reserved5_2[7];
93194 +            unsigned long reserved5_1[3];
93195 +            int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
93196 +            int banknum; // 0 or 1, which virtual register bank is active
93197 +            unsigned long rrs[8]; // region registers
93198 +            unsigned long krs[8]; // kernel registers
93199 +            unsigned long pkrs[8]; // protection key registers
93200 +            unsigned long tmp[8]; // temp registers (e.g. for hyperprivops)
93201 +        };
93202 +    };
93203 +};
93204 +typedef struct mapped_regs mapped_regs_t;
93205 +
93206 +struct vpd {
93207 +    struct mapped_regs vpd_low;
93208 +    unsigned long  reserved6[3456];
93209 +    unsigned long  vmm_avail[128];
93210 +    unsigned long  reserved7[4096];
93211 +};
93212 +typedef struct vpd vpd_t;
93213 +
93214 +struct arch_vcpu_info {
93215 +};
93216 +typedef struct arch_vcpu_info arch_vcpu_info_t;
93217 +
93218 +struct arch_shared_info {
93219 +    /* PFN of the start_info page.  */
93220 +    unsigned long start_info_pfn;
93221 +
93222 +    /* Interrupt vector for event channel.  */
93223 +    int evtchn_vector;
93224 +
93225 +    uint64_t pad[32];
93226 +};
93227 +typedef struct arch_shared_info arch_shared_info_t;
93228 +
93229 +typedef unsigned long xen_callback_t;
93230 +
93231 +struct ia64_tr_entry {
93232 +    unsigned long pte;
93233 +    unsigned long itir;
93234 +    unsigned long vadr;
93235 +    unsigned long rid;
93236 +};
93237 +
93238 +struct vcpu_extra_regs {
93239 +    struct ia64_tr_entry itrs[8];
93240 +    struct ia64_tr_entry dtrs[8];
93241 +    unsigned long iva;
93242 +    unsigned long dcr;
93243 +    unsigned long event_callback_ip;
93244 +};
93245 +
93246 +struct vcpu_guest_context {
93247 +#define VGCF_EXTRA_REGS (1<<1) /* Get/Set extra regs.  */
93248 +    unsigned long flags;       /* VGCF_* flags */
93249 +
93250 +    struct cpu_user_regs user_regs;
93251 +    struct vcpu_extra_regs extra_regs;
93252 +    unsigned long privregs_pfn;
93253 +};
93254 +typedef struct vcpu_guest_context vcpu_guest_context_t;
93255 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
93256 +
93257 +/* dom0 vp op */
93258 +#define __HYPERVISOR_ia64_dom0vp_op     __HYPERVISOR_arch_0
93259 +/*  Map I/O space, given by machine address, into dom0's physical address space.
93260 +    Currently the assigned physical address equals the machine address.  */
93261 +#define IA64_DOM0VP_ioremap             0
93262 +
93263 +/* Convert a pseudo physical page frame number to the corresponding
93264 +   machine page frame number. If no page is assigned, INVALID_MFN or
93265 +   GPFN_INV_MASK is returned, depending on the domain's non-vti/vti mode.  */
93266 +#define IA64_DOM0VP_phystomach          1
93267 +
93268 +/* Convert a machine page frame number to the corresponding pseudo physical
93269 +   page frame number of the caller domain.  */
93270 +#define IA64_DOM0VP_machtophys          3
93271 +
93272 +/* Reserved for future use.  */
93273 +#define IA64_DOM0VP_iounmap             4
93274 +
93275 +/* Unmap and free pages contained in the specified pseudo physical region.  */
93276 +#define IA64_DOM0VP_zap_physmap         5
93277 +
93278 +/* Assign machine page frame to dom0's pseudo physical address space.  */
93279 +#define IA64_DOM0VP_add_physmap         6
93280 +
93281 +/* expose the p2m table to the domain */
93282 +#define IA64_DOM0VP_expose_p2m          7
93283 +
93284 +/* xen perfmon */
93285 +#define IA64_DOM0VP_perfmon             8
93286 +
93287 +/* gmfn version of IA64_DOM0VP_add_physmap */
93288 +#define IA64_DOM0VP_add_physmap_with_gmfn       9
93289 +
93290 +// flags for page assignment to pseudo physical address space
93291 +#define _ASSIGN_readonly                0
93292 +#define ASSIGN_readonly                 (1UL << _ASSIGN_readonly)
93293 +#define ASSIGN_writable                 (0UL << _ASSIGN_readonly) // dummy flag
93294 +/* Internal only: memory attribute must be WC/UC/UCE.  */
93295 +#define _ASSIGN_nocache                 1
93296 +#define ASSIGN_nocache                  (1UL << _ASSIGN_nocache)
93297 +// tlb tracking
93298 +#define _ASSIGN_tlb_track               2
93299 +#define ASSIGN_tlb_track                (1UL << _ASSIGN_tlb_track)
93300 +/* Internal only: associated with PGC_allocated bit */
93301 +#define _ASSIGN_pgc_allocated           3
93302 +#define ASSIGN_pgc_allocated            (1UL << _ASSIGN_pgc_allocated)
93303 +
93304 +/* This structure has the same layout as struct ia64_boot_param, defined in
93305 +   <asm/system.h>.  It is redefined here to ease use.  */
93306 +struct xen_ia64_boot_param {
93307 +       unsigned long command_line;     /* physical address of cmd line args */
93308 +       unsigned long efi_systab;       /* physical address of EFI system table */
93309 +       unsigned long efi_memmap;       /* physical address of EFI memory map */
93310 +       unsigned long efi_memmap_size;  /* size of EFI memory map */
93311 +       unsigned long efi_memdesc_size; /* size of an EFI memory map descriptor */
93312 +       unsigned int  efi_memdesc_version;      /* memory descriptor version */
93313 +       struct {
93314 +               unsigned short num_cols;        /* number of columns on console.  */
93315 +               unsigned short num_rows;        /* number of rows on console.  */
93316 +               unsigned short orig_x;  /* cursor's x position */
93317 +               unsigned short orig_y;  /* cursor's y position */
93318 +       } console_info;
93319 +       unsigned long fpswa;            /* physical address of the fpswa interface */
93320 +       unsigned long initrd_start;
93321 +       unsigned long initrd_size;
93322 +       unsigned long domain_start;     /* va where the boot time domain begins */
93323 +       unsigned long domain_size;      /* how big is the boot domain */
93324 +};
93325 +
93326 +#endif /* !__ASSEMBLY__ */
93327 +
93328 +/* Size of the shared_info area (this is not related to page size).  */
93329 +#define XSI_SHIFT                      14
93330 +#define XSI_SIZE                       (1 << XSI_SHIFT)
93331 +/* Log size of mapped_regs area (64 KB - only 4KB is used).  */
93332 +#define XMAPPEDREGS_SHIFT              12
93333 +#define XMAPPEDREGS_SIZE               (1 << XMAPPEDREGS_SHIFT)
93334 +/* Offset of XASI (Xen arch shared info) wrt XSI_BASE.  */
93335 +#define XMAPPEDREGS_OFS                        XSI_SIZE
93336 +
93337 +/* Hyperprivops.  */
93338 +#define HYPERPRIVOP_RFI                        0x1
93339 +#define HYPERPRIVOP_RSM_DT             0x2
93340 +#define HYPERPRIVOP_SSM_DT             0x3
93341 +#define HYPERPRIVOP_COVER              0x4
93342 +#define HYPERPRIVOP_ITC_D              0x5
93343 +#define HYPERPRIVOP_ITC_I              0x6
93344 +#define HYPERPRIVOP_SSM_I              0x7
93345 +#define HYPERPRIVOP_GET_IVR            0x8
93346 +#define HYPERPRIVOP_GET_TPR            0x9
93347 +#define HYPERPRIVOP_SET_TPR            0xa
93348 +#define HYPERPRIVOP_EOI                        0xb
93349 +#define HYPERPRIVOP_SET_ITM            0xc
93350 +#define HYPERPRIVOP_THASH              0xd
93351 +#define HYPERPRIVOP_PTC_GA             0xe
93352 +#define HYPERPRIVOP_ITR_D              0xf
93353 +#define HYPERPRIVOP_GET_RR             0x10
93354 +#define HYPERPRIVOP_SET_RR             0x11
93355 +#define HYPERPRIVOP_SET_KR             0x12
93356 +#define HYPERPRIVOP_FC                 0x13
93357 +#define HYPERPRIVOP_GET_CPUID          0x14
93358 +#define HYPERPRIVOP_GET_PMD            0x15
93359 +#define HYPERPRIVOP_GET_EFLAG          0x16
93360 +#define HYPERPRIVOP_SET_EFLAG          0x17
93361 +#define HYPERPRIVOP_RSM_BE             0x18
93362 +#define HYPERPRIVOP_GET_PSR            0x19
93363 +#define HYPERPRIVOP_MAX                        0x19
93364 +
93365 +/* Fast and light hypercalls.  */
93366 +#define __HYPERVISOR_ia64_fast_eoi     0x0200
93367 +
93368 +/* Xencomm macros.  */
93369 +#define XENCOMM_INLINE_MASK 0xf800000000000000UL
93370 +#define XENCOMM_INLINE_FLAG 0x8000000000000000UL
93371 +
93372 +#define XENCOMM_IS_INLINE(addr) \
93373 +  (((unsigned long)(addr) & XENCOMM_INLINE_MASK) == XENCOMM_INLINE_FLAG)
93374 +#define XENCOMM_INLINE_ADDR(addr) \
93375 +  ((unsigned long)(addr) & ~XENCOMM_INLINE_MASK)
93376 +
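For illustration, a consumer of a xencomm address might decode it as below; the helper name is made up, and treating the masked bits as a (pseudo-)physical address is an assumption about the inline encoding (sketch only):

    /* Sketch only: decode an address that may carry the xencomm inline flag. */
    static unsigned long xencomm_decode_example(void *addr)
    {
        if (XENCOMM_IS_INLINE(addr))
            /* Inline case: the masked bits are taken to be the address itself. */
            return XENCOMM_INLINE_ADDR(addr);
        /* Otherwise the pointer refers to a xencomm descriptor describing the buffer. */
        return (unsigned long)addr;
    }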
93377 +/* xen perfmon */
93378 +#ifdef XEN
93379 +#ifndef __ASSEMBLY__
93380 +#ifndef _ASM_IA64_PERFMON_H
93381 +
93382 +#include <xen/list.h>   // asm/perfmon.h requires struct list_head
93383 +#include <asm/perfmon.h>
93384 +// for PFM_xxx and pfarg_features_t, pfarg_context_t, pfarg_reg_t, pfarg_load_t
93385 +
93386 +#endif /* _ASM_IA64_PERFMON_H */
93387 +
93388 +DEFINE_XEN_GUEST_HANDLE(pfarg_features_t);
93389 +DEFINE_XEN_GUEST_HANDLE(pfarg_context_t);
93390 +DEFINE_XEN_GUEST_HANDLE(pfarg_reg_t);
93391 +DEFINE_XEN_GUEST_HANDLE(pfarg_load_t);
93392 +#endif /* __ASSEMBLY__ */
93393 +#endif /* XEN */
93394 +
93395 +#endif /* __HYPERVISOR_IF_IA64_H__ */
93396 +
93397 +/*
93398 + * Local variables:
93399 + * mode: C
93400 + * c-set-style: "BSD"
93401 + * c-basic-offset: 4
93402 + * tab-width: 4
93403 + * indent-tabs-mode: nil
93404 + * End:
93405 + */
93406 diff -ruNp linux-2.6.19/include/xen/interface/arch-powerpc.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-powerpc.h
93407 --- linux-2.6.19/include/xen/interface/arch-powerpc.h   1970-01-01 00:00:00.000000000 +0000
93408 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-powerpc.h 2007-02-02 19:11:00.000000000 +0000
93409 @@ -0,0 +1,121 @@
93410 +/*
93411 + * Permission is hereby granted, free of charge, to any person obtaining a copy
93412 + * of this software and associated documentation files (the "Software"), to
93413 + * deal in the Software without restriction, including without limitation the
93414 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
93415 + * sell copies of the Software, and to permit persons to whom the Software is
93416 + * furnished to do so, subject to the following conditions:
93417 + *
93418 + * The above copyright notice and this permission notice shall be included in
93419 + * all copies or substantial portions of the Software.
93420 + *
93421 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
93422 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
93423 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
93424 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
93425 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
93426 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
93427 + * DEALINGS IN THE SOFTWARE.
93428 + *
93429 + * Copyright (C) IBM Corp. 2005, 2006
93430 + *
93431 + * Authors: Hollis Blanchard <hollisb@us.ibm.com>
93432 + */
93433 +
93434 +#ifndef __XEN_PUBLIC_ARCH_PPC_64_H__
93435 +#define __XEN_PUBLIC_ARCH_PPC_64_H__
93436 +
93437 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
93438 +    typedef struct { \
93439 +        int __pad[(sizeof (long long) - sizeof (void *)) / sizeof (int)]; \
93440 +        type *p; \
93441 +    } __attribute__((__aligned__(8))) __guest_handle_ ## name
93442 +
93443 +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
93444 +#define XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
93445 +#define set_xen_guest_handle(hnd, val) \
93446 +    do { \
93447 +        if (sizeof ((hnd).__pad)) \
93448 +            (hnd).__pad[0] = 0; \
93449 +        (hnd).p = val; \
93450 +    } while (0)
93451 +
93452 +#ifdef __XEN_TOOLS__
93453 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
93454 +#endif
93455 +
93456 +#ifndef __ASSEMBLY__
93457 +/* Guest handles for primitive C types. */
93458 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
93459 +__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
93460 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
93461 +DEFINE_XEN_GUEST_HANDLE(char);
93462 +DEFINE_XEN_GUEST_HANDLE(int);
93463 +DEFINE_XEN_GUEST_HANDLE(long);
93464 +DEFINE_XEN_GUEST_HANDLE(void);
93465 +
93466 +typedef unsigned long long xen_pfn_t;
93467 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
93468 +#endif
93469 +
93470 +/*
93471 + * Pointers and other address fields inside interface structures are padded to
93472 + * 64 bits. This means that field alignments aren't different between 32- and
93473 + * 64-bit architectures. 
93474 + */
93475 +/* NB. Multi-level macro ensures __LINE__ is expanded before concatenation. */
93476 +#define __MEMORY_PADDING(_X)
93477 +#define _MEMORY_PADDING(_X)  __MEMORY_PADDING(_X)
93478 +#define MEMORY_PADDING       _MEMORY_PADDING(__LINE__)
93479 +
93480 +/* And the trap vector is... */
93481 +#define TRAP_INSTR "li 0,-1; sc" /* XXX just "sc"? */
93482 +
93483 +#ifndef __ASSEMBLY__
93484 +
93485 +#define XENCOMM_INLINE_FLAG (1UL << 63)
93486 +
93487 +typedef uint64_t xen_ulong_t;
93488 +
93489 +/* User-accessible registers: need to be saved/restored for every nested Xen
93490 + * invocation. */
93491 +struct cpu_user_regs
93492 +{
93493 +    uint64_t gprs[32];
93494 +    uint64_t lr;
93495 +    uint64_t ctr;
93496 +    uint64_t srr0;
93497 +    uint64_t srr1;
93498 +    uint64_t pc;
93499 +    uint64_t msr;
93500 +    uint64_t fpscr;
93501 +    uint64_t xer;
93502 +    uint64_t hid4;
93503 +    uint32_t cr;
93504 +    uint32_t entry_vector;
93505 +};
93506 +typedef struct cpu_user_regs cpu_user_regs_t;
93507 +
93508 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ /* XXX timebase */
93509 +
93510 +/* ONLY used to communicate with dom0! See also struct exec_domain. */
93511 +struct vcpu_guest_context {
93512 +    cpu_user_regs_t user_regs;         /* User-level CPU registers     */
93513 +    uint64_t sdr1;                     /* Pagetable base               */
93514 +    /* XXX etc */
93515 +};
93516 +typedef struct vcpu_guest_context vcpu_guest_context_t;
93517 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
93518 +
93519 +struct arch_shared_info {
93520 +    uint64_t pad[32];
93521 +};
93522 +
93523 +struct arch_vcpu_info {
93524 +};
93525 +
93526 +/* Support for multi-processor guests. */
93527 +#define MAX_VIRT_CPUS 32
93528 +#endif
93529 +
93530 +#endif
93531 diff -ruNp linux-2.6.19/include/xen/interface/arch-x86/xen-x86_32.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86/xen-x86_32.h
93532 --- linux-2.6.19/include/xen/interface/arch-x86/xen-x86_32.h    1970-01-01 00:00:00.000000000 +0000
93533 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86/xen-x86_32.h  2007-02-02 19:11:00.000000000 +0000
93534 @@ -0,0 +1,151 @@
93535 +/******************************************************************************
93536 + * xen-x86_32.h
93537 + * 
93538 + * Guest OS interface to x86 32-bit Xen.
93539 + * 
93540 + * Permission is hereby granted, free of charge, to any person obtaining a copy
93541 + * of this software and associated documentation files (the "Software"), to
93542 + * deal in the Software without restriction, including without limitation the
93543 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
93544 + * sell copies of the Software, and to permit persons to whom the Software is
93545 + * furnished to do so, subject to the following conditions:
93546 + *
93547 + * The above copyright notice and this permission notice shall be included in
93548 + * all copies or substantial portions of the Software.
93549 + *
93550 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
93551 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
93552 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
93553 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
93554 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
93555 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
93556 + * DEALINGS IN THE SOFTWARE.
93557 + *
93558 + * Copyright (c) 2004-2006, K A Fraser
93559 + */
93560 +
93561 +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
93562 +#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
93563 +
93564 +/*
93565 + * Hypercall interface:
93566 + *  Input:  %ebx, %ecx, %edx, %esi, %edi (arguments 1-5)
93567 + *  Output: %eax
93568 + * Access is via hypercall page (set up by guest loader or via a Xen MSR):
93569 + *  call hypercall_page + hypercall-number * 32
93570 + * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx)
93571 + */
93572 +
93573 +#if __XEN_INTERFACE_VERSION__ < 0x00030203
93574 +/*
93575 + * Legacy hypercall interface:
93576 + * As above, except the entry sequence to the hypervisor is:
93577 + *  mov $hypercall-number*32,%eax ; int $0x82
93578 + */
93579 +#define TRAP_INSTR "int $0x82"
93580 +#endif
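As a rough sketch of the convention above (not the kernel's real stubs): a two-argument hypercall indirects through the hypercall page, passes arguments in %ebx/%ecx and gets the result back in %eax. The hypercall_page symbol is assumed to be the page set up by the guest loader or via the Xen MSR.

    /* Sketch only: 2-argument hypercall via the hypercall page (x86_32). */
    extern char hypercall_page[];

    static inline long example_hypercall2(unsigned int op,
                                          unsigned long a1, unsigned long a2)
    {
        /* Entry point: hypercall_page + op * 32, as documented above. */
        unsigned long entry = (unsigned long)&hypercall_page[op * 32];
        asm volatile ("call *%0"
                      : "+a" (entry), "+b" (a1), "+c" (a2)
                      :
                      : "memory");
        return (long)entry;   /* %eax holds the hypercall's return value */
    }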
93581 +
93582 +/*
93583 + * These flat segments are in the Xen-private section of every GDT. Since these
93584 + * are also present in the initial GDT, many OSes will be able to avoid
93585 + * installing their own GDT.
93586 + */
93587 +#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
93588 +#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
93589 +#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
93590 +#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
93591 +#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
93592 +#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
93593 +
93594 +#define FLAT_KERNEL_CS FLAT_RING1_CS
93595 +#define FLAT_KERNEL_DS FLAT_RING1_DS
93596 +#define FLAT_KERNEL_SS FLAT_RING1_SS
93597 +#define FLAT_USER_CS    FLAT_RING3_CS
93598 +#define FLAT_USER_DS    FLAT_RING3_DS
93599 +#define FLAT_USER_SS    FLAT_RING3_SS
93600 +
93601 +/*
93602 + * Virtual addresses beyond this are not modifiable by guest OSes. The 
93603 + * machine->physical mapping table starts at this address, read-only.
93604 + */
93605 +#ifdef CONFIG_X86_PAE
93606 +#define __HYPERVISOR_VIRT_START 0xF5800000
93607 +#define __MACH2PHYS_VIRT_START  0xF5800000
93608 +#define __MACH2PHYS_VIRT_END    0xF6800000
93609 +#else
93610 +#define __HYPERVISOR_VIRT_START 0xFC000000
93611 +#define __MACH2PHYS_VIRT_START  0xFC000000
93612 +#define __MACH2PHYS_VIRT_END    0xFC400000
93613 +#endif
93614 +
93615 +#ifndef HYPERVISOR_VIRT_START
93616 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
93617 +#endif
93618 +
93619 +#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
93620 +#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
93621 +#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
93622 +#ifndef machine_to_phys_mapping
93623 +#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
93624 +#endif
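For illustration, a guest translating a machine frame back to its own pseudo-physical frame simply indexes the read-only table declared above (sketch; the error value is made up):

    /* Sketch: machine frame number -> guest pseudo-physical frame number. */
    static inline unsigned long example_mfn_to_pfn(unsigned long mfn)
    {
        if (mfn >= MACH2PHYS_NR_ENTRIES)
            return ~0UL;                      /* illustrative "invalid" value */
        return machine_to_phys_mapping[mfn];  /* table is mapped read-only by Xen */
    }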
93625 +
93626 +#ifndef __ASSEMBLY__
93627 +
93628 +struct cpu_user_regs {
93629 +    uint32_t ebx;
93630 +    uint32_t ecx;
93631 +    uint32_t edx;
93632 +    uint32_t esi;
93633 +    uint32_t edi;
93634 +    uint32_t ebp;
93635 +    uint32_t eax;
93636 +    uint16_t error_code;    /* private */
93637 +    uint16_t entry_vector;  /* private */
93638 +    uint32_t eip;
93639 +    uint16_t cs;
93640 +    uint8_t  saved_upcall_mask;
93641 +    uint8_t  _pad0;
93642 +    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
93643 +    uint32_t esp;
93644 +    uint16_t ss, _pad1;
93645 +    uint16_t es, _pad2;
93646 +    uint16_t ds, _pad3;
93647 +    uint16_t fs, _pad4;
93648 +    uint16_t gs, _pad5;
93649 +};
93650 +typedef struct cpu_user_regs cpu_user_regs_t;
93651 +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
93652 +
93653 +/*
93654 + * Page-directory addresses above 4GB do not fit into architectural %cr3.
93655 + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
93656 + * must use the following accessor macros to pack/unpack valid MFNs.
93657 + */
93658 +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
93659 +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
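An illustrative round trip through the two macros (the frame number is made up):

    /*
     * pfn = 0x12345678  (a frame above the 4GB boundary)
     * xen_pfn_to_cr3(pfn)        == (0x12345678u << 12) | (0x12345678u >> 20) == 0x45678123
     * xen_cr3_to_pfn(0x45678123) == (0x45678123u >> 12) | (0x45678123u << 20) == 0x12345678
     * i.e. frame bits 31:20 ride in the low 12 bits of the packed value, so a
     * full 32-bit frame number fits in the 32-bit register.
     */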
93660 +
93661 +struct arch_vcpu_info {
93662 +    unsigned long cr2;
93663 +    unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
93664 +};
93665 +typedef struct arch_vcpu_info arch_vcpu_info_t;
93666 +
93667 +struct xen_callback {
93668 +    unsigned long cs;
93669 +    unsigned long eip;
93670 +};
93671 +typedef struct xen_callback xen_callback_t;
93672 +
93673 +#endif /* !__ASSEMBLY__ */
93674 +
93675 +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */
93676 +
93677 +/*
93678 + * Local variables:
93679 + * mode: C
93680 + * c-set-style: "BSD"
93681 + * c-basic-offset: 4
93682 + * tab-width: 4
93683 + * indent-tabs-mode: nil
93684 + * End:
93685 + */
93686 diff -ruNp linux-2.6.19/include/xen/interface/arch-x86/xen-x86_64.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86/xen-x86_64.h
93687 --- linux-2.6.19/include/xen/interface/arch-x86/xen-x86_64.h    1970-01-01 00:00:00.000000000 +0000
93688 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86/xen-x86_64.h  2007-02-02 19:11:00.000000000 +0000
93689 @@ -0,0 +1,208 @@
93690 +/******************************************************************************
93691 + * xen-x86_64.h
93692 + * 
93693 + * Guest OS interface to x86 64-bit Xen.
93694 + * 
93695 + * Permission is hereby granted, free of charge, to any person obtaining a copy
93696 + * of this software and associated documentation files (the "Software"), to
93697 + * deal in the Software without restriction, including without limitation the
93698 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
93699 + * sell copies of the Software, and to permit persons to whom the Software is
93700 + * furnished to do so, subject to the following conditions:
93701 + *
93702 + * The above copyright notice and this permission notice shall be included in
93703 + * all copies or substantial portions of the Software.
93704 + *
93705 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
93706 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
93707 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
93708 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
93709 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
93710 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
93711 + * DEALINGS IN THE SOFTWARE.
93712 + *
93713 + * Copyright (c) 2004-2006, K A Fraser
93714 + */
93715 +
93716 +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
93717 +#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
93718 +
93719 +/*
93720 + * Hypercall interface:
93721 + *  Input:  %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5)
93722 + *  Output: %rax
93723 + * Access is via hypercall page (set up by guest loader or via a Xen MSR):
93724 + *  call hypercall_page + hypercall-number * 32
93725 + * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi)
93726 + */
93727 +
93728 +#if __XEN_INTERFACE_VERSION__ < 0x00030203
93729 +/*
93730 + * Legacy hypercall interface:
93731 + * As above, except the entry sequence to the hypervisor is:
93732 + *  mov $hypercall-number*32,%eax ; syscall
93733 + * Clobbered: %rcx, %r11, argument registers (as above)
93734 + */
93735 +#define TRAP_INSTR "syscall"
93736 +#endif
93737 +
93738 +/*
93739 + * 64-bit segment selectors
93740 + * These flat segments are in the Xen-private section of every GDT. Since these
93741 + * are also present in the initial GDT, many OSes will be able to avoid
93742 + * installing their own GDT.
93743 + */
93744 +
93745 +#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
93746 +#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
93747 +#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
93748 +#define FLAT_RING3_DS64 0x0000  /* NULL selector */
93749 +#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
93750 +#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
93751 +
93752 +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
93753 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
93754 +#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
93755 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
93756 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
93757 +#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
93758 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
93759 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
93760 +#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
93761 +
93762 +#define FLAT_USER_DS64 FLAT_RING3_DS64
93763 +#define FLAT_USER_DS32 FLAT_RING3_DS32
93764 +#define FLAT_USER_DS   FLAT_USER_DS64
93765 +#define FLAT_USER_CS64 FLAT_RING3_CS64
93766 +#define FLAT_USER_CS32 FLAT_RING3_CS32
93767 +#define FLAT_USER_CS   FLAT_USER_CS64
93768 +#define FLAT_USER_SS64 FLAT_RING3_SS64
93769 +#define FLAT_USER_SS32 FLAT_RING3_SS32
93770 +#define FLAT_USER_SS   FLAT_USER_SS64
93771 +
93772 +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
93773 +#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
93774 +#define __MACH2PHYS_VIRT_START  0xFFFF800000000000
93775 +#define __MACH2PHYS_VIRT_END    0xFFFF804000000000
93776 +
93777 +#ifndef HYPERVISOR_VIRT_START
93778 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
93779 +#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
93780 +#endif
93781 +
93782 +#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
93783 +#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
93784 +#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
93785 +#ifndef machine_to_phys_mapping
93786 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
93787 +#endif
93788 +
93789 +#ifndef __ASSEMBLY__
93790 +
93791 +/*
93792 + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
93793 + *  @which == SEGBASE_*  ;  @base == 64-bit base address
93794 + * Returns 0 on success.
93795 + */
93796 +#define SEGBASE_FS          0
93797 +#define SEGBASE_GS_USER     1
93798 +#define SEGBASE_GS_KERNEL   2
93799 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
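For example, a 64-bit guest kernel could point the user-mode %gs base at a chosen address; the sketch below assumes a HYPERVISOR_set_segment_base() guest wrapper matching the prototype in the comment above.

    /* Sketch only: select the user %gs base via the hypercall documented above. */
    static int example_set_user_gs_base(unsigned long base)
    {
        return HYPERVISOR_set_segment_base(SEGBASE_GS_USER, base);
    }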
93800 +
93801 +/*
93802 + * int HYPERVISOR_iret(void)
93803 + * All arguments are on the kernel stack, in the following format.
93804 + * Never returns if successful. Current kernel context is lost.
93805 + * The saved CS is mapped as follows:
93806 + *   RING0 -> RING3 kernel mode.
93807 + *   RING1 -> RING3 kernel mode.
93808 + *   RING2 -> RING3 kernel mode.
93809 + *   RING3 -> RING3 user mode.
93810 + * However, RING0 indicates that the guest kernel should return to itself
93811 + * directly with
93812 + *      orb   $3,1*8(%rsp)
93813 + *      iretq
93814 + * If flags contains VGCF_in_syscall:
93815 + *   Restore RAX, RIP, RFLAGS, RSP.
93816 + *   Discard R11, RCX, CS, SS.
93817 + * Otherwise:
93818 + *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
93819 + * All other registers are saved on hypercall entry and restored to user.
93820 + */
93821 +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
93822 +#define _VGCF_in_syscall 8
93823 +#define VGCF_in_syscall  (1<<_VGCF_in_syscall)
93824 +#define VGCF_IN_SYSCALL  VGCF_in_syscall
93825 +struct iret_context {
93826 +    /* Top of stack (%rsp at point of hypercall). */
93827 +    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
93828 +    /* Bottom of iret stack frame. */
93829 +};
93830 +
93831 +#ifdef __GNUC__
93832 +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
93833 +#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }
93834 +#else
93835 +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
93836 +#define __DECL_REG(name) uint64_t r ## name
93837 +#endif
93838 +
93839 +struct cpu_user_regs {
93840 +    uint64_t r15;
93841 +    uint64_t r14;
93842 +    uint64_t r13;
93843 +    uint64_t r12;
93844 +    __DECL_REG(bp);
93845 +    __DECL_REG(bx);
93846 +    uint64_t r11;
93847 +    uint64_t r10;
93848 +    uint64_t r9;
93849 +    uint64_t r8;
93850 +    __DECL_REG(ax);
93851 +    __DECL_REG(cx);
93852 +    __DECL_REG(dx);
93853 +    __DECL_REG(si);
93854 +    __DECL_REG(di);
93855 +    uint32_t error_code;    /* private */
93856 +    uint32_t entry_vector;  /* private */
93857 +    __DECL_REG(ip);
93858 +    uint16_t cs, _pad0[1];
93859 +    uint8_t  saved_upcall_mask;
93860 +    uint8_t  _pad1[3];
93861 +    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
93862 +    __DECL_REG(sp);
93863 +    uint16_t ss, _pad2[3];
93864 +    uint16_t es, _pad3[3];
93865 +    uint16_t ds, _pad4[3];
93866 +    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
93867 +    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
93868 +};
93869 +typedef struct cpu_user_regs cpu_user_regs_t;
93870 +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
93871 +
93872 +#undef __DECL_REG
93873 +
93874 +#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
93875 +#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
93876 +
93877 +struct arch_vcpu_info {
93878 +    unsigned long cr2;
93879 +    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
93880 +};
93881 +typedef struct arch_vcpu_info arch_vcpu_info_t;
93882 +
93883 +typedef unsigned long xen_callback_t;
93884 +
93885 +#endif /* !__ASSEMBLY__ */
93886 +
93887 +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */
93888 +
93889 +/*
93890 + * Local variables:
93891 + * mode: C
93892 + * c-set-style: "BSD"
93893 + * c-basic-offset: 4
93894 + * tab-width: 4
93895 + * indent-tabs-mode: nil
93896 + * End:
93897 + */
93898 diff -ruNp linux-2.6.19/include/xen/interface/arch-x86/xen.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86/xen.h
93899 --- linux-2.6.19/include/xen/interface/arch-x86/xen.h   1970-01-01 00:00:00.000000000 +0000
93900 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86/xen.h 2007-02-02 19:11:00.000000000 +0000
93901 @@ -0,0 +1,190 @@
93902 +/******************************************************************************
93903 + * arch-x86/xen.h
93904 + * 
93905 + * Guest OS interface to x86 Xen.
93906 + * 
93907 + * Permission is hereby granted, free of charge, to any person obtaining a copy
93908 + * of this software and associated documentation files (the "Software"), to
93909 + * deal in the Software without restriction, including without limitation the
93910 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
93911 + * sell copies of the Software, and to permit persons to whom the Software is
93912 + * furnished to do so, subject to the following conditions:
93913 + *
93914 + * The above copyright notice and this permission notice shall be included in
93915 + * all copies or substantial portions of the Software.
93916 + *
93917 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
93918 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
93919 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
93920 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
93921 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
93922 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
93923 + * DEALINGS IN THE SOFTWARE.
93924 + *
93925 + * Copyright (c) 2004-2006, K A Fraser
93926 + */
93927 +
93928 +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__
93929 +#define __XEN_PUBLIC_ARCH_X86_XEN_H__
93930 +
93931 +/* Structural guest handles introduced in 0x00030201. */
93932 +#if __XEN_INTERFACE_VERSION__ >= 0x00030201
93933 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
93934 +    typedef struct { type *p; } __guest_handle_ ## name
93935 +#else
93936 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
93937 +    typedef type * __guest_handle_ ## name
93938 +#endif
93939 +
93940 +#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
93941 +#define XEN_GUEST_HANDLE(name)          __guest_handle_ ## name
93942 +#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
93943 +#ifdef __XEN_TOOLS__
93944 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
93945 +#endif
93946 +
93947 +#ifndef __ASSEMBLY__
93948 +/* Guest handles for primitive C types. */
93949 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
93950 +__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
93951 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
93952 +DEFINE_XEN_GUEST_HANDLE(char);
93953 +DEFINE_XEN_GUEST_HANDLE(int);
93954 +DEFINE_XEN_GUEST_HANDLE(long);
93955 +DEFINE_XEN_GUEST_HANDLE(void);
93956 +
93957 +typedef unsigned long xen_pfn_t;
93958 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
93959 +#endif
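For illustration, a caller passing an array of frame numbers to a hypercall wraps its buffer in a guest handle as below; this assumes the structural handle form (interface version >= 0x00030201), and the helper name is made up.

    /* Sketch: store a caller-side buffer pointer in a guest handle. */
    static void example_fill_handle(XEN_GUEST_HANDLE(xen_pfn_t) *hnd,
                                    xen_pfn_t *frames)
    {
        set_xen_guest_handle(*hnd, frames);   /* assigns the handle's .p field */
    }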
93960 +
93961 +#if defined(__i386__)
93962 +#include "xen-x86_32.h"
93963 +#elif defined(__x86_64__)
93964 +#include "xen-x86_64.h"
93965 +#endif
93966 +
93967 +/*
93968 + * SEGMENT DESCRIPTOR TABLES
93969 + */
93970 +/*
93971 + * A number of GDT entries are reserved by Xen. These are not situated at the
93972 + * start of the GDT because some stupid OSes export hard-coded selector values
93973 + * in their ABI. These hard-coded values are always near the start of the GDT,
93974 + * so Xen places itself out of the way, at the far end of the GDT.
93975 + */
93976 +#define FIRST_RESERVED_GDT_PAGE  14
93977 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
93978 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
93979 +
93980 +/* Maximum number of virtual CPUs in multi-processor guests. */
93981 +#define MAX_VIRT_CPUS 32
93982 +
93983 +#ifndef __ASSEMBLY__
93984 +
93985 +typedef unsigned long xen_ulong_t;
93986 +
93987 +/*
93988 + * Send an array of these to HYPERVISOR_set_trap_table().
93989 + * The privilege level specifies which modes may enter a trap via a software
93990 + * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
93991 + * privilege levels as follows:
93992 + *  Level == 0: No one may enter
93993 + *  Level == 1: Kernel may enter
93994 + *  Level == 2: Kernel may enter
93995 + *  Level == 3: Everyone may enter
93996 + */
93997 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
93998 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
93999 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
94000 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
94001 +struct trap_info {
94002 +    uint8_t       vector;  /* exception vector                              */
94003 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
94004 +    uint16_t      cs;      /* code selector                                 */
94005 +    unsigned long address; /* code offset                                   */
94006 +};
94007 +typedef struct trap_info trap_info_t;
94008 +DEFINE_XEN_GUEST_HANDLE(trap_info_t);
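As a sketch of how a guest fills one entry of the virtual IDT before handing the array to HYPERVISOR_set_trap_table() (vector and handler are caller-supplied; the helper name is made up):

    /* Sketch: one trap_info entry, enterable from guest kernel mode (DPL 1). */
    static void example_fill_trap(struct trap_info *ti,
                                  uint8_t vector, unsigned long handler)
    {
        ti->vector  = vector;
        ti->flags   = 0;
        TI_SET_DPL(ti, 1);            /* kernel may enter via software interrupt */
        ti->cs      = FLAT_KERNEL_CS; /* flat kernel selector from the arch header */
        ti->address = handler;
    }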
94009 +
94010 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
94011 +
94012 +/*
94013 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
94014 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
94015 + */
94016 +struct vcpu_guest_context {
94017 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
94018 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
94019 +#define VGCF_I387_VALID                (1<<0)
94020 +#define VGCF_IN_KERNEL                 (1<<2)
94021 +#define _VGCF_i387_valid               0
94022 +#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
94023 +#define _VGCF_in_kernel                2
94024 +#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
94025 +#define _VGCF_failsafe_disables_events 3
94026 +#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
94027 +#define _VGCF_syscall_disables_events  4
94028 +#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
94029 +    unsigned long flags;                    /* VGCF_* flags                 */
94030 +    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
94031 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
94032 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
94033 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
94034 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
94035 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
94036 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
94037 +#ifdef __i386__
94038 +    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
94039 +    unsigned long event_callback_eip;
94040 +    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
94041 +    unsigned long failsafe_callback_eip;
94042 +#else
94043 +    unsigned long event_callback_eip;
94044 +    unsigned long failsafe_callback_eip;
94045 +    unsigned long syscall_callback_eip;
94046 +#endif
94047 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
94048 +#ifdef __x86_64__
94049 +    /* Segment base addresses. */
94050 +    uint64_t      fs_base;
94051 +    uint64_t      gs_base_kernel;
94052 +    uint64_t      gs_base_user;
94053 +#endif
94054 +};
94055 +typedef struct vcpu_guest_context vcpu_guest_context_t;
94056 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
94057 +
94058 +struct arch_shared_info {
94059 +    unsigned long max_pfn;                  /* max pfn that appears in table */
94060 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
94061 +    xen_pfn_t     pfn_to_mfn_frame_list_list;
94062 +    unsigned long nmi_reason;
94063 +    uint64_t pad[32];
94064 +};
94065 +typedef struct arch_shared_info arch_shared_info_t;
94066 +
94067 +#endif /* !__ASSEMBLY__ */
94068 +
94069 +/*
94070 + * Prefix forces emulation of some non-trapping instructions.
94071 + * Currently only CPUID.
94072 + */
94073 +#ifdef __ASSEMBLY__
94074 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
94075 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
94076 +#else
94077 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
94078 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
94079 +#endif
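For illustration, a PV guest can use the forced-emulation form so that Xen intercepts the otherwise non-trapping CPUID; leaf 0x40000000 is the conventional hypervisor signature base (sketch only):

    /* Sketch: CPUID that Xen is forced to emulate via the prefix above. */
    static void example_xen_cpuid(uint32_t leaf, uint32_t regs[4])
    {
        asm volatile (XEN_CPUID
                      : "=a" (regs[0]), "=b" (regs[1]),
                        "=c" (regs[2]), "=d" (regs[3])
                      : "0" (leaf));
    }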
94080 +
94081 +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */
94082 +
94083 +/*
94084 + * Local variables:
94085 + * mode: C
94086 + * c-set-style: "BSD"
94087 + * c-basic-offset: 4
94088 + * tab-width: 4
94089 + * indent-tabs-mode: nil
94090 + * End:
94091 + */
94092 diff -ruNp linux-2.6.19/include/xen/interface/arch-x86_32.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86_32.h
94093 --- linux-2.6.19/include/xen/interface/arch-x86_32.h    1970-01-01 00:00:00.000000000 +0000
94094 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86_32.h  2007-02-02 19:11:00.000000000 +0000
94095 @@ -0,0 +1,27 @@
94096 +/******************************************************************************
94097 + * arch-x86_32.h
94098 + * 
94099 + * Guest OS interface to x86 32-bit Xen.
94100 + * 
94101 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94102 + * of this software and associated documentation files (the "Software"), to
94103 + * deal in the Software without restriction, including without limitation the
94104 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
94105 + * sell copies of the Software, and to permit persons to whom the Software is
94106 + * furnished to do so, subject to the following conditions:
94107 + *
94108 + * The above copyright notice and this permission notice shall be included in
94109 + * all copies or substantial portions of the Software.
94110 + *
94111 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94112 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94113 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94114 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94115 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94116 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
94117 + * DEALINGS IN THE SOFTWARE.
94118 + *
94119 + * Copyright (c) 2004-2006, K A Fraser
94120 + */
94121 +
94122 +#include "arch-x86/xen.h"
94123 diff -ruNp linux-2.6.19/include/xen/interface/arch-x86_64.h linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86_64.h
94124 --- linux-2.6.19/include/xen/interface/arch-x86_64.h    1970-01-01 00:00:00.000000000 +0000
94125 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/arch-x86_64.h  2007-02-02 19:11:00.000000000 +0000
94126 @@ -0,0 +1,27 @@
94127 +/******************************************************************************
94128 + * arch-x86_64.h
94129 + * 
94130 + * Guest OS interface to x86 64-bit Xen.
94131 + * 
94132 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94133 + * of this software and associated documentation files (the "Software"), to
94134 + * deal in the Software without restriction, including without limitation the
94135 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
94136 + * sell copies of the Software, and to permit persons to whom the Software is
94137 + * furnished to do so, subject to the following conditions:
94138 + *
94139 + * The above copyright notice and this permission notice shall be included in
94140 + * all copies or substantial portions of the Software.
94141 + *
94142 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94143 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94144 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94145 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94146 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94147 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
94148 + * DEALINGS IN THE SOFTWARE.
94149 + *
94150 + * Copyright (c) 2004-2006, K A Fraser
94151 + */
94152 +
94153 +#include "arch-x86/xen.h"
94154 diff -ruNp linux-2.6.19/include/xen/interface/callback.h linux-2.6.19-xen-3.0.4/include/xen/interface/callback.h
94155 --- linux-2.6.19/include/xen/interface/callback.h       1970-01-01 00:00:00.000000000 +0000
94156 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/callback.h     2007-02-02 19:11:00.000000000 +0000
94157 @@ -0,0 +1,92 @@
94158 +/******************************************************************************
94159 + * callback.h
94160 + *
94161 + * Register guest OS callbacks with Xen.
94162 + *
94163 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94164 + * of this software and associated documentation files (the "Software"), to
94165 + * deal in the Software without restriction, including without limitation the
94166 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
94167 + * sell copies of the Software, and to permit persons to whom the Software is
94168 + * furnished to do so, subject to the following conditions:
94169 + *
94170 + * The above copyright notice and this permission notice shall be included in
94171 + * all copies or substantial portions of the Software.
94172 + *
94173 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94174 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94175 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94176 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94177 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94178 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
94179 + * DEALINGS IN THE SOFTWARE.
94180 + *
94181 + * Copyright (c) 2006, Ian Campbell
94182 + */
94183 +
94184 +#ifndef __XEN_PUBLIC_CALLBACK_H__
94185 +#define __XEN_PUBLIC_CALLBACK_H__
94186 +
94187 +#include "xen.h"
94188 +
94189 +/*
94190 + * Prototype for this hypercall is:
94191 + *   long callback_op(int cmd, void *extra_args)
94192 + * @cmd        == CALLBACKOP_??? (callback operation).
94193 + * @extra_args == Operation-specific extra arguments (NULL if none).
94194 + */
94195 +
94196 +#define CALLBACKTYPE_event                 0
94197 +#define CALLBACKTYPE_failsafe              1
94198 +#define CALLBACKTYPE_syscall               2 /* x86_64 only */
94199 +/*
94200 + * sysenter is only available on x86_32 with the
94201 + * supervisor_mode_kernel option enabled.
94202 + */
94203 +#define CALLBACKTYPE_sysenter              3
94204 +#define CALLBACKTYPE_nmi                   4
94205 +
94206 +/*
94207 + * Disable event delivery during callback? This flag is ignored for event and
94208 + * NMI callbacks: event delivery is unconditionally disabled.
94209 + */
94210 +#define _CALLBACKF_mask_events             0
94211 +#define CALLBACKF_mask_events              (1U << _CALLBACKF_mask_events)
94212 +
94213 +/*
94214 + * Register a callback.
94215 + */
94216 +#define CALLBACKOP_register                0
94217 +struct callback_register {
94218 +    uint16_t type;
94219 +    uint16_t flags;
94220 +    xen_callback_t address;
94221 +};
94222 +typedef struct callback_register callback_register_t;
94223 +DEFINE_XEN_GUEST_HANDLE(callback_register_t);
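A hedged sketch of registering the event callback: it assumes a HYPERVISOR_callback_op(cmd, arg) guest wrapper for the hypercall whose prototype is given at the top of this header, and the entry point is caller-supplied.

    /* Sketch: register the event-delivery entry point with Xen. */
    static int example_register_event_callback(xen_callback_t entry)
    {
        struct callback_register cb = {
            .type    = CALLBACKTYPE_event,
            .flags   = 0,        /* event delivery is masked during delivery anyway */
            .address = entry,
        };
        return HYPERVISOR_callback_op(CALLBACKOP_register, &cb);
    }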
94224 +
94225 +/*
94226 + * Unregister a callback.
94227 + *
94228 + * Not all callbacks can be unregistered. -EINVAL will be returned if
94229 + * you attempt to unregister such a callback.
94230 + */
94231 +#define CALLBACKOP_unregister              1
94232 +struct callback_unregister {
94233 +    uint16_t type;
94234 +    uint16_t _unused;
94235 +};
94236 +typedef struct callback_unregister callback_unregister_t;
94237 +DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
94238 +
94239 +#endif /* __XEN_PUBLIC_CALLBACK_H__ */
94240 +
94241 +/*
94242 + * Local variables:
94243 + * mode: C
94244 + * c-set-style: "BSD"
94245 + * c-basic-offset: 4
94246 + * tab-width: 4
94247 + * indent-tabs-mode: nil
94248 + * End:
94249 + */
94250 diff -ruNp linux-2.6.19/include/xen/interface/dom0_ops.h linux-2.6.19-xen-3.0.4/include/xen/interface/dom0_ops.h
94251 --- linux-2.6.19/include/xen/interface/dom0_ops.h       1970-01-01 00:00:00.000000000 +0000
94252 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/dom0_ops.h     2007-02-02 19:11:00.000000000 +0000
94253 @@ -0,0 +1,120 @@
94254 +/******************************************************************************
94255 + * dom0_ops.h
94256 + * 
94257 + * Process command requests from domain-0 guest OS.
94258 + * 
94259 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94260 + * of this software and associated documentation files (the "Software"), to
94261 + * deal in the Software without restriction, including without limitation the
94262 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
94263 + * sell copies of the Software, and to permit persons to whom the Software is
94264 + * furnished to do so, subject to the following conditions:
94265 + *
94266 + * The above copyright notice and this permission notice shall be included in
94267 + * all copies or substantial portions of the Software.
94268 + *
94269 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94270 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94271 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94272 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94273 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94274 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
94275 + * DEALINGS IN THE SOFTWARE.
94276 + *
94277 + * Copyright (c) 2002-2003, B Dragovic
94278 + * Copyright (c) 2002-2006, K Fraser
94279 + */
94280 +
94281 +#ifndef __XEN_PUBLIC_DOM0_OPS_H__
94282 +#define __XEN_PUBLIC_DOM0_OPS_H__
94283 +
94284 +#include "xen.h"
94285 +#include "platform.h"
94286 +
94287 +#if __XEN_INTERFACE_VERSION__ >= 0x00030204
94288 +#error "dom0_ops.h is a compatibility interface only"
94289 +#endif
94290 +
94291 +#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION
94292 +
94293 +#define DOM0_SETTIME          XENPF_settime
94294 +#define dom0_settime          xenpf_settime
94295 +#define dom0_settime_t        xenpf_settime_t
94296 +
94297 +#define DOM0_ADD_MEMTYPE      XENPF_add_memtype
94298 +#define dom0_add_memtype      xenpf_add_memtype
94299 +#define dom0_add_memtype_t    xenpf_add_memtype_t
94300 +
94301 +#define DOM0_DEL_MEMTYPE      XENPF_del_memtype
94302 +#define dom0_del_memtype      xenpf_del_memtype
94303 +#define dom0_del_memtype_t    xenpf_del_memtype_t
94304 +
94305 +#define DOM0_READ_MEMTYPE     XENPF_read_memtype
94306 +#define dom0_read_memtype     xenpf_read_memtype
94307 +#define dom0_read_memtype_t   xenpf_read_memtype_t
94308 +
94309 +#define DOM0_MICROCODE        XENPF_microcode_update
94310 +#define dom0_microcode        xenpf_microcode_update
94311 +#define dom0_microcode_t      xenpf_microcode_update_t
94312 +
94313 +#define DOM0_PLATFORM_QUIRK   XENPF_platform_quirk
94314 +#define dom0_platform_quirk   xenpf_platform_quirk
94315 +#define dom0_platform_quirk_t xenpf_platform_quirk_t
94316 +
94317 +typedef uint64_t cpumap_t;
94318 +
94319 +/* Unsupported legacy operation -- defined for API compatibility. */
94320 +#define DOM0_MSR                 15
94321 +struct dom0_msr {
94322 +    /* IN variables. */
94323 +    uint32_t write;
94324 +    cpumap_t cpu_mask;
94325 +    uint32_t msr;
94326 +    uint32_t in1;
94327 +    uint32_t in2;
94328 +    /* OUT variables. */
94329 +    uint32_t out1;
94330 +    uint32_t out2;
94331 +};
94332 +typedef struct dom0_msr dom0_msr_t;
94333 +DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
94334 +
94335 +/* Unsupported legacy operation -- defined for API compatibility. */
94336 +#define DOM0_PHYSICAL_MEMORY_MAP 40
94337 +struct dom0_memory_map_entry {
94338 +    uint64_t start, end;
94339 +    uint32_t flags; /* reserved */
94340 +    uint8_t  is_ram;
94341 +};
94342 +typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
94343 +DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
94344 +
94345 +struct dom0_op {
94346 +    uint32_t cmd;
94347 +    uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
94348 +    union {
94349 +        struct dom0_msr               msr;
94350 +        struct dom0_settime           settime;
94351 +        struct dom0_add_memtype       add_memtype;
94352 +        struct dom0_del_memtype       del_memtype;
94353 +        struct dom0_read_memtype      read_memtype;
94354 +        struct dom0_microcode         microcode;
94355 +        struct dom0_platform_quirk    platform_quirk;
94356 +        struct dom0_memory_map_entry  physical_memory_map;
94357 +        uint8_t                       pad[128];
94358 +    } u;
94359 +};
94360 +typedef struct dom0_op dom0_op_t;
94361 +DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
94362 +
94363 +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
94364 +
94365 +/*
94366 + * Local variables:
94367 + * mode: C
94368 + * c-set-style: "BSD"
94369 + * c-basic-offset: 4
94370 + * tab-width: 4
94371 + * indent-tabs-mode: nil
94372 + * End:
94373 + */
94374 diff -ruNp linux-2.6.19/include/xen/interface/domctl.h linux-2.6.19-xen-3.0.4/include/xen/interface/domctl.h
94375 --- linux-2.6.19/include/xen/interface/domctl.h 1970-01-01 00:00:00.000000000 +0000
94376 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/domctl.h       2007-02-02 19:11:00.000000000 +0000
94377 @@ -0,0 +1,437 @@
94378 +/******************************************************************************
94379 + * domctl.h
94380 + * 
94381 + * Domain management operations. For use by node control stack.
94382 + * 
94383 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94384 + * of this software and associated documentation files (the "Software"), to
94385 + * deal in the Software without restriction, including without limitation the
94386 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
94387 + * sell copies of the Software, and to permit persons to whom the Software is
94388 + * furnished to do so, subject to the following conditions:
94389 + *
94390 + * The above copyright notice and this permission notice shall be included in
94391 + * all copies or substantial portions of the Software.
94392 + *
94393 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94394 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94395 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94396 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94397 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94398 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
94399 + * DEALINGS IN THE SOFTWARE.
94400 + *
94401 + * Copyright (c) 2002-2003, B Dragovic
94402 + * Copyright (c) 2002-2006, K Fraser
94403 + */
94404 +
94405 +#ifndef __XEN_PUBLIC_DOMCTL_H__
94406 +#define __XEN_PUBLIC_DOMCTL_H__
94407 +
94408 +#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
94409 +#error "domctl operations are intended for use by node control tools only"
94410 +#endif
94411 +
94412 +#include "xen.h"
94413 +
94414 +#define XEN_DOMCTL_INTERFACE_VERSION 0x00000004
94415 +
94416 +struct xenctl_cpumap {
94417 +    XEN_GUEST_HANDLE(uint8_t) bitmap;
94418 +    uint32_t nr_cpus;
94419 +};
94420 +
94421 +/*
94422 + * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
94423 + * If it is specified as zero, an id is auto-allocated and returned.
94424 + */
94425 +#define XEN_DOMCTL_createdomain       1
94426 +struct xen_domctl_createdomain {
94427 +    /* IN parameters */
94428 +    uint32_t ssidref;
94429 +    xen_domain_handle_t handle;
94430 + /* Is this an HVM guest (as opposed to a PV guest)? */
94431 +#define _XEN_DOMCTL_CDF_hvm_guest 0
94432 +#define XEN_DOMCTL_CDF_hvm_guest  (1U<<_XEN_DOMCTL_CDF_hvm_guest)
94433 +    uint32_t flags;
94434 +};
94435 +typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
94436 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
94437 +
94438 +#define XEN_DOMCTL_destroydomain      2
94439 +#define XEN_DOMCTL_pausedomain        3
94440 +#define XEN_DOMCTL_unpausedomain      4
94441 +
94442 +#define XEN_DOMCTL_getdomaininfo      5
94443 +struct xen_domctl_getdomaininfo {
94444 +    /* OUT variables. */
94445 +    domid_t  domain;              /* Also echoed in domctl.domain */
94446 + /* Domain is scheduled to die. */
94447 +#define _XEN_DOMINF_dying     0
94448 +#define XEN_DOMINF_dying      (1U<<_XEN_DOMINF_dying)
94449 + /* Domain is an HVM guest (as opposed to a PV guest). */
94450 +#define _XEN_DOMINF_hvm_guest 1
94451 +#define XEN_DOMINF_hvm_guest  (1U<<_XEN_DOMINF_hvm_guest)
94452 + /* The guest OS has shut down. */
94453 +#define _XEN_DOMINF_shutdown  2
94454 +#define XEN_DOMINF_shutdown   (1U<<_XEN_DOMINF_shutdown)
94455 + /* Currently paused by control software. */
94456 +#define _XEN_DOMINF_paused    3
94457 +#define XEN_DOMINF_paused     (1U<<_XEN_DOMINF_paused)
94458 + /* Currently blocked pending an event.     */
94459 +#define _XEN_DOMINF_blocked   4
94460 +#define XEN_DOMINF_blocked    (1U<<_XEN_DOMINF_blocked)
94461 + /* Domain is currently running.            */
94462 +#define _XEN_DOMINF_running   5
94463 +#define XEN_DOMINF_running    (1U<<_XEN_DOMINF_running)
94464 + /* CPU to which this domain is bound.      */
94465 +#define XEN_DOMINF_cpumask      255
94466 +#define XEN_DOMINF_cpushift       8
94467 + /* XEN_DOMINF_shutdown guest-supplied code.  */
94468 +#define XEN_DOMINF_shutdownmask 255
94469 +#define XEN_DOMINF_shutdownshift 16
94470 +    uint32_t flags;              /* XEN_DOMINF_* */
94471 +    uint64_t tot_pages;
94472 +    uint64_t max_pages;
94473 +    uint64_t shared_info_frame;  /* GMFN of shared_info struct */
94474 +    uint64_t cpu_time;
94475 +    uint32_t nr_online_vcpus;    /* Number of VCPUs currently online. */
94476 +    uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
94477 +    uint32_t ssidref;
94478 +    xen_domain_handle_t handle;
94479 +};
94480 +typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
94481 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
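The flags word above packs several things at once: boolean state bits in the low byte, the bound CPU in bits 8-15 and the guest-supplied shutdown code in bits 16-23. A short sketch of the unpacking (the helper name is hypothetical; the constants are exactly those defined above):

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical helper: decode xen_domctl_getdomaininfo.flags. */
    static void print_domaininfo_flags(uint32_t flags)
    {
        printf("dying=%d hvm=%d paused=%d blocked=%d running=%d\n",
               !!(flags & XEN_DOMINF_dying),
               !!(flags & XEN_DOMINF_hvm_guest),
               !!(flags & XEN_DOMINF_paused),
               !!(flags & XEN_DOMINF_blocked),
               !!(flags & XEN_DOMINF_running));
        printf("bound to cpu %u\n",
               (flags >> XEN_DOMINF_cpushift) & XEN_DOMINF_cpumask);
        if (flags & XEN_DOMINF_shutdown)
            printf("shutdown code %u\n",
                   (flags >> XEN_DOMINF_shutdownshift) & XEN_DOMINF_shutdownmask);
    }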
94482 +
94483 +
94484 +#define XEN_DOMCTL_getmemlist         6
94485 +struct xen_domctl_getmemlist {
94486 +    /* IN variables. */
94487 +    /* Max entries to write to output buffer. */
94488 +    uint64_t max_pfns;
94489 +    /* Start index in guest's page list. */
94490 +    uint64_t start_pfn;
94491 +    XEN_GUEST_HANDLE(xen_pfn_t) buffer;
94492 +    /* OUT variables. */
94493 +    uint64_t num_pfns;
94494 +};
94495 +typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t;
94496 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
94497 +
94498 +
94499 +#define XEN_DOMCTL_getpageframeinfo   7
94500 +
94501 +#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28
94502 +#define XEN_DOMCTL_PFINFO_NOTAB   (0x0<<28)
94503 +#define XEN_DOMCTL_PFINFO_L1TAB   (0x1<<28)
94504 +#define XEN_DOMCTL_PFINFO_L2TAB   (0x2<<28)
94505 +#define XEN_DOMCTL_PFINFO_L3TAB   (0x3<<28)
94506 +#define XEN_DOMCTL_PFINFO_L4TAB   (0x4<<28)
94507 +#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7<<28)
94508 +#define XEN_DOMCTL_PFINFO_LPINTAB (0x1<<31)
94509 +#define XEN_DOMCTL_PFINFO_XTAB    (0xf<<28) /* invalid page */
94510 +#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xf<<28)
94511 +
94512 +struct xen_domctl_getpageframeinfo {
94513 +    /* IN variables. */
94514 +    uint64_t gmfn;        /* GMFN to query */
94515 +    /* OUT variables. */
94516 +    /* Is the page PINNED to a type? */
94517 +    uint32_t type;         /* see above type defs */
94518 +};
94519 +typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
94520 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
94521 +
94522 +
94523 +#define XEN_DOMCTL_getpageframeinfo2  8
94524 +struct xen_domctl_getpageframeinfo2 {
94525 +    /* IN variables. */
94526 +    uint64_t num;
94527 +    /* IN/OUT variables. */
94528 +    XEN_GUEST_HANDLE(ulong) array;
94529 +};
94530 +typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
94531 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
94532 +
94533 +
94534 +/*
94535 + * Control shadow pagetables operation
94536 + */
94537 +#define XEN_DOMCTL_shadow_op         10
94538 +
94539 +/* Disable shadow mode. */
94540 +#define XEN_DOMCTL_SHADOW_OP_OFF         0
94541 +
94542 +/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */
94543 +#define XEN_DOMCTL_SHADOW_OP_ENABLE      32
94544 +
94545 +/* Log-dirty bitmap operations. */
94546 + /* Return the bitmap and clean internal copy for next round. */
94547 +#define XEN_DOMCTL_SHADOW_OP_CLEAN       11
94548 + /* Return the bitmap but do not modify internal copy. */
94549 +#define XEN_DOMCTL_SHADOW_OP_PEEK        12
94550 +
94551 +/* Memory allocation accessors. */
94552 +#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION   30
94553 +#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION   31
94554 +
94555 +/* Legacy enable operations. */
94556 + /* Equiv. to ENABLE with no mode flags. */
94557 +#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST       1
94558 + /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */
94559 +#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY   2
94560 + /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */
94561 +#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE  3
94562 +
94563 +/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */
94564 + /*
94565 +  * Shadow pagetables are refcounted: guest does not use explicit mmu
94566 +  * operations nor write-protect its pagetables.
94567 +  */
94568 +#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT  (1 << 1)
94569 + /*
94570 +  * Log pages in a bitmap as they are dirtied.
94571 +  * Used for live relocation to determine which pages must be re-sent.
94572 +  */
94573 +#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2)
94574 + /*
94575 +  * Automatically translate GPFNs into MFNs.
94576 +  */
94577 +#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3)
94578 + /*
94579 +  * Xen does not steal virtual address space from the guest.
94580 +  * Requires HVM support.
94581 +  */
94582 +#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL  (1 << 4)
94583 +
94584 +struct xen_domctl_shadow_op_stats {
94585 +    uint32_t fault_count;
94586 +    uint32_t dirty_count;
94587 +};
94588 +typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t;
94589 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t);
94590 +
94591 +struct xen_domctl_shadow_op {
94592 +    /* IN variables. */
94593 +    uint32_t       op;       /* XEN_DOMCTL_SHADOW_OP_* */
94594 +
94595 +    /* OP_ENABLE */
94596 +    uint32_t       mode;     /* XEN_DOMCTL_SHADOW_ENABLE_* */
94597 +
94598 +    /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */
94599 +    uint32_t       mb;       /* Shadow memory allocation in MB */
94600 +
94601 +    /* OP_PEEK / OP_CLEAN */
94602 +    XEN_GUEST_HANDLE(ulong) dirty_bitmap;
94603 +    uint64_t       pages;    /* Size of buffer. Updated with actual size. */
94604 +    struct xen_domctl_shadow_op_stats stats;
94605 +};
94606 +typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t;
94607 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t);
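As a hedged sketch of how these fields combine, the fragment below builds one log-dirty round (OP_CLEAN returns the bitmap and resets Xen's copy) the way a migration tool might. Only the field and constant names come from this header; the xen_domctl envelope is defined later in this file, and set_xen_guest_handle() is assumed from the sparse tree.

    /* Hypothetical helper; assumes <string.h> and this header are included. */
    static void build_logdirty_clean(struct xen_domctl *ctl, domid_t dom,
                                     unsigned long *bitmap, uint64_t nr_pages)
    {
        memset(ctl, 0, sizeof(*ctl));
        ctl->cmd               = XEN_DOMCTL_shadow_op;
        ctl->interface_version = XEN_DOMCTL_INTERFACE_VERSION;
        ctl->domain            = dom;
        ctl->u.shadow_op.op    = XEN_DOMCTL_SHADOW_OP_CLEAN;
        ctl->u.shadow_op.pages = nr_pages;   /* buffer size; updated on return */
        set_xen_guest_handle(ctl->u.shadow_op.dirty_bitmap, bitmap); /* assumed helper */
        /* On return, stats.fault_count and stats.dirty_count are filled in. */
    }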
94608 +
94609 +
94610 +#define XEN_DOMCTL_max_mem           11
94611 +struct xen_domctl_max_mem {
94612 +    /* IN variables. */
94613 +    uint64_t max_memkb;
94614 +};
94615 +typedef struct xen_domctl_max_mem xen_domctl_max_mem_t;
94616 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t);
94617 +
94618 +
94619 +#define XEN_DOMCTL_setvcpucontext    12
94620 +#define XEN_DOMCTL_getvcpucontext    13
94621 +struct xen_domctl_vcpucontext {
94622 +    uint32_t              vcpu;                  /* IN */
94623 +    XEN_GUEST_HANDLE(vcpu_guest_context_t) ctxt; /* IN/OUT */
94624 +};
94625 +typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t;
94626 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t);
94627 +
94628 +
94629 +#define XEN_DOMCTL_getvcpuinfo       14
94630 +struct xen_domctl_getvcpuinfo {
94631 +    /* IN variables. */
94632 +    uint32_t vcpu;
94633 +    /* OUT variables. */
94634 +    uint8_t  online;                  /* currently online (not hotplugged)? */
94635 +    uint8_t  blocked;                 /* blocked waiting for an event? */
94636 +    uint8_t  running;                 /* currently scheduled on its CPU? */
94637 +    uint64_t cpu_time;                /* total cpu time consumed (ns) */
94638 +    uint32_t cpu;                     /* current mapping   */
94639 +};
94640 +typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t;
94641 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
94642 +
94643 +
94644 +/* Get/set which physical cpus a vcpu can execute on. */
94645 +#define XEN_DOMCTL_setvcpuaffinity    9
94646 +#define XEN_DOMCTL_getvcpuaffinity   25
94647 +struct xen_domctl_vcpuaffinity {
94648 +    uint32_t  vcpu;              /* IN */
94649 +    struct xenctl_cpumap cpumap; /* IN/OUT */
94650 +};
94651 +typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
94652 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
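The cpumap is passed as a guest handle to a byte-granular bitmap plus a CPU count. A minimal sketch, assuming set_xen_guest_handle() from the sparse tree and at least four physical CPUs:

    /* Hypothetical helper: pin vcpu 0 of a domain to physical CPUs 0-3. */
    static void build_setaffinity(struct xen_domctl *ctl, domid_t dom,
                                  uint8_t *bits, uint32_t nr_cpus)
    {
        memset(bits, 0, (nr_cpus + 7) / 8);
        bits[0] = 0x0f;                               /* CPUs 0,1,2,3 */
        memset(ctl, 0, sizeof(*ctl));
        ctl->cmd                           = XEN_DOMCTL_setvcpuaffinity;
        ctl->interface_version             = XEN_DOMCTL_INTERFACE_VERSION;
        ctl->domain                        = dom;
        ctl->u.vcpuaffinity.vcpu           = 0;
        ctl->u.vcpuaffinity.cpumap.nr_cpus = nr_cpus;
        set_xen_guest_handle(ctl->u.vcpuaffinity.cpumap.bitmap, bits); /* assumed */
    }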
94653 +
94654 +
94655 +#define XEN_DOMCTL_max_vcpus         15
94656 +struct xen_domctl_max_vcpus {
94657 +    uint32_t max;           /* maximum number of vcpus */
94658 +};
94659 +typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
94660 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
94661 +
94662 +
94663 +#define XEN_DOMCTL_scheduler_op      16
94664 +/* Scheduler types. */
94665 +#define XEN_SCHEDULER_SEDF     4
94666 +#define XEN_SCHEDULER_CREDIT   5
94667 +/* Set or get info? */
94668 +#define XEN_DOMCTL_SCHEDOP_putinfo 0
94669 +#define XEN_DOMCTL_SCHEDOP_getinfo 1
94670 +struct xen_domctl_scheduler_op {
94671 +    uint32_t sched_id;  /* XEN_SCHEDULER_* */
94672 +    uint32_t cmd;       /* XEN_DOMCTL_SCHEDOP_* */
94673 +    union {
94674 +        struct xen_domctl_sched_sedf {
94675 +            uint64_t period;
94676 +            uint64_t slice;
94677 +            uint64_t latency;
94678 +            uint32_t extratime;
94679 +            uint32_t weight;
94680 +        } sedf;
94681 +        struct xen_domctl_sched_credit {
94682 +            uint16_t weight;
94683 +            uint16_t cap;
94684 +        } credit;
94685 +    } u;
94686 +};
94687 +typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
94688 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t);
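A hedged sketch of a credit-scheduler putinfo request; treating 512 as "twice the default weight" and 0 as "no cap" are assumptions about the credit scheduler, not statements from this header:

    /* Hypothetical helper: set credit-scheduler parameters for a domain. */
    static void build_credit_putinfo(struct xen_domctl *ctl, domid_t dom)
    {
        memset(ctl, 0, sizeof(*ctl));
        ctl->cmd                            = XEN_DOMCTL_scheduler_op;
        ctl->interface_version              = XEN_DOMCTL_INTERFACE_VERSION;
        ctl->domain                         = dom;
        ctl->u.scheduler_op.sched_id        = XEN_SCHEDULER_CREDIT;
        ctl->u.scheduler_op.cmd             = XEN_DOMCTL_SCHEDOP_putinfo;
        ctl->u.scheduler_op.u.credit.weight = 512;  /* assumed: default is 256 */
        ctl->u.scheduler_op.u.credit.cap    = 0;    /* assumed: 0 = uncapped   */
    }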
94689 +
94690 +
94691 +#define XEN_DOMCTL_setdomainhandle   17
94692 +struct xen_domctl_setdomainhandle {
94693 +    xen_domain_handle_t handle;
94694 +};
94695 +typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t;
94696 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t);
94697 +
94698 +
94699 +#define XEN_DOMCTL_setdebugging      18
94700 +struct xen_domctl_setdebugging {
94701 +    uint8_t enable;
94702 +};
94703 +typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t;
94704 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t);
94705 +
94706 +
94707 +#define XEN_DOMCTL_irq_permission    19
94708 +struct xen_domctl_irq_permission {
94709 +    uint8_t pirq;
94710 +    uint8_t allow_access;    /* flag to specify enable/disable of IRQ access */
94711 +};
94712 +typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t;
94713 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t);
94714 +
94715 +
94716 +#define XEN_DOMCTL_iomem_permission  20
94717 +struct xen_domctl_iomem_permission {
94718 +    uint64_t first_mfn;       /* first page (physical page number) in range */
94719 +    uint64_t nr_mfns;         /* number of pages in range (>0) */
94720 +    uint8_t  allow_access;    /* allow (!0) or deny (0) access to range? */
94721 +};
94722 +typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
94723 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t);
94724 +
94725 +
94726 +#define XEN_DOMCTL_ioport_permission 21
94727 +struct xen_domctl_ioport_permission {
94728 +    uint32_t first_port;              /* first port in range */
94729 +    uint32_t nr_ports;                /* size of port range */
94730 +    uint8_t  allow_access;            /* allow or deny access to range? */
94731 +};
94732 +typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t;
94733 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t);
94734 +
94735 +#define XEN_DOMCTL_hypercall_init    22
94736 +struct xen_domctl_hypercall_init {
94737 +    uint64_t  gmfn;            /* GMFN to be initialised */
94738 +};
94739 +typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t;
94740 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
94741 +
94742 +#define XEN_DOMCTL_arch_setup        23
94743 +#define _XEN_DOMAINSETUP_hvm_guest 0
94744 +#define XEN_DOMAINSETUP_hvm_guest  (1UL<<_XEN_DOMAINSETUP_hvm_guest)
94745 +#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save)  */
94746 +#define XEN_DOMAINSETUP_query  (1UL<<_XEN_DOMAINSETUP_query)
94747 +typedef struct xen_domctl_arch_setup {
94748 +    uint64_t flags;      /* XEN_DOMAINSETUP_* */
94749 +#ifdef __ia64__
94750 +    uint64_t bp;            /* mpaddr of boot param area */
94751 +    uint64_t maxmem;        /* Highest memory address for MDT.  */
94752 +    uint64_t xsi_va;        /* Xen shared_info area virtual address.  */
94753 +    uint32_t hypercall_imm; /* Break imm for Xen hypercalls.  */
94754 +#endif
94755 +} xen_domctl_arch_setup_t;
94756 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
94757 +
94758 +#define XEN_DOMCTL_settimeoffset     24
94759 +struct xen_domctl_settimeoffset {
94760 +    int32_t  time_offset_seconds; /* applied to domain wallclock time */
94761 +};
94762 +typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
94763 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
94764 +
94765 +#define XEN_DOMCTL_real_mode_area     26
94766 +struct xen_domctl_real_mode_area {
94767 +    uint32_t log; /* log2 of Real Mode Area size */
94768 +};
94769 +typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
94770 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
94771 +
94772 +struct xen_domctl {
94773 +    uint32_t cmd;
94774 +    uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
94775 +    domid_t  domain;
94776 +    union {
94777 +        struct xen_domctl_createdomain      createdomain;
94778 +        struct xen_domctl_getdomaininfo     getdomaininfo;
94779 +        struct xen_domctl_getmemlist        getmemlist;
94780 +        struct xen_domctl_getpageframeinfo  getpageframeinfo;
94781 +        struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
94782 +        struct xen_domctl_vcpuaffinity      vcpuaffinity;
94783 +        struct xen_domctl_shadow_op         shadow_op;
94784 +        struct xen_domctl_max_mem           max_mem;
94785 +        struct xen_domctl_vcpucontext       vcpucontext;
94786 +        struct xen_domctl_getvcpuinfo       getvcpuinfo;
94787 +        struct xen_domctl_max_vcpus         max_vcpus;
94788 +        struct xen_domctl_scheduler_op      scheduler_op;
94789 +        struct xen_domctl_setdomainhandle   setdomainhandle;
94790 +        struct xen_domctl_setdebugging      setdebugging;
94791 +        struct xen_domctl_irq_permission    irq_permission;
94792 +        struct xen_domctl_iomem_permission  iomem_permission;
94793 +        struct xen_domctl_ioport_permission ioport_permission;
94794 +        struct xen_domctl_hypercall_init    hypercall_init;
94795 +        struct xen_domctl_arch_setup        arch_setup;
94796 +        struct xen_domctl_settimeoffset     settimeoffset;
94797 +        struct xen_domctl_real_mode_area    real_mode_area;
94798 +        uint8_t                             pad[128];
94799 +    } u;
94800 +};
94801 +typedef struct xen_domctl xen_domctl_t;
94802 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_t);
94803 +
94804 +#endif /* __XEN_PUBLIC_DOMCTL_H__ */
94805 +
94806 +/*
94807 + * Local variables:
94808 + * mode: C
94809 + * c-set-style: "BSD"
94810 + * c-basic-offset: 4
94811 + * tab-width: 4
94812 + * indent-tabs-mode: nil
94813 + * End:
94814 + */
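Tying the envelope together: the sketch below fills a complete xen_domctl for XEN_DOMCTL_createdomain, relying on the IN/OUT convention documented above (domain == 0 asks Xen to pick an id, which is echoed back). It assumes xen_domain_handle_t is a byte-array typedef from xen.h and that <string.h> is available.

    /* Hypothetical helper: request creation of an HVM domain. */
    static void build_createdomain(struct xen_domctl *ctl,
                                   const xen_domain_handle_t uuid)
    {
        memset(ctl, 0, sizeof(*ctl));
        ctl->cmd                    = XEN_DOMCTL_createdomain;
        ctl->interface_version      = XEN_DOMCTL_INTERFACE_VERSION;
        ctl->domain                 = 0;  /* 0 => auto-allocate; echoed back OUT */
        ctl->u.createdomain.ssidref = 0;
        ctl->u.createdomain.flags   = XEN_DOMCTL_CDF_hvm_guest;
        memcpy(ctl->u.createdomain.handle, uuid, sizeof(xen_domain_handle_t));
    }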
94815 diff -ruNp linux-2.6.19/include/xen/interface/elfnote.h linux-2.6.19-xen-3.0.4/include/xen/interface/elfnote.h
94816 --- linux-2.6.19/include/xen/interface/elfnote.h        1970-01-01 00:00:00.000000000 +0000
94817 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/elfnote.h      2007-02-02 19:11:00.000000000 +0000
94818 @@ -0,0 +1,179 @@
94819 +/******************************************************************************
94820 + * elfnote.h
94821 + *
94822 + * Definitions used for the Xen ELF notes.
94823 + *
94824 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94825 + * of this software and associated documentation files (the "Software"), to
94826 + * deal in the Software without restriction, including without limitation the
94827 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
94828 + * sell copies of the Software, and to permit persons to whom the Software is
94829 + * furnished to do so, subject to the following conditions:
94830 + *
94831 + * The above copyright notice and this permission notice shall be included in
94832 + * all copies or substantial portions of the Software.
94833 + *
94834 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94835 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94836 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94837 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94838 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94839 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
94840 + * DEALINGS IN THE SOFTWARE.
94841 + *
94842 + * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
94843 + */
94844 +
94845 +#ifndef __XEN_PUBLIC_ELFNOTE_H__
94846 +#define __XEN_PUBLIC_ELFNOTE_H__
94847 +
94848 +/*
94849 + * The notes should live in a SHT_NOTE section and have "Xen" in the
94850 + * name field.
94851 + *
94852 + * Numeric types are either 4 or 8 bytes depending on the content of
94853 + * the desc field.
94854 + *
94855 + * LEGACY indicated the fields in the legacy __xen_guest string which
94856 + * this a note type replaces.
94857 + */
94858 +
94859 +/*
94860 + * NAME=VALUE pair (string).
94861 + *
94862 + * LEGACY: FEATURES and PAE
94863 + */
94864 +#define XEN_ELFNOTE_INFO           0
94865 +
94866 +/*
94867 + * The virtual address of the entry point (numeric).
94868 + *
94869 + * LEGACY: VIRT_ENTRY
94870 + */
94871 +#define XEN_ELFNOTE_ENTRY          1
94872 +
94873 +/* The virtual address of the hypercall transfer page (numeric).
94874 + *
94875 + * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
94876 + * number not a virtual address)
94877 + */
94878 +#define XEN_ELFNOTE_HYPERCALL_PAGE 2
94879 +
94880 +/* The virtual address where the kernel image should be mapped (numeric).
94881 + *
94882 + * Defaults to 0.
94883 + *
94884 + * LEGACY: VIRT_BASE
94885 + */
94886 +#define XEN_ELFNOTE_VIRT_BASE      3
94887 +
94888 +/*
94889 + * The offset of the ELF paddr field from the actual required
94890 + * pseudo-physical address (numeric).
94891 + *
94892 + * This is used to maintain backwards compatibility with older kernels
94893 + * which wrote __PAGE_OFFSET into that field. This field defaults to 0
94894 + * if not present.
94895 + *
94896 + * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
94897 + */
94898 +#define XEN_ELFNOTE_PADDR_OFFSET   4
94899 +
94900 +/*
94901 + * The version of Xen that we work with (string).
94902 + *
94903 + * LEGACY: XEN_VER
94904 + */
94905 +#define XEN_ELFNOTE_XEN_VERSION    5
94906 +
94907 +/*
94908 + * The name of the guest operating system (string).
94909 + *
94910 + * LEGACY: GUEST_OS
94911 + */
94912 +#define XEN_ELFNOTE_GUEST_OS       6
94913 +
94914 +/*
94915 + * The version of the guest operating system (string).
94916 + *
94917 + * LEGACY: GUEST_VER
94918 + */
94919 +#define XEN_ELFNOTE_GUEST_VERSION  7
94920 +
94921 +/*
94922 + * The loader type (string).
94923 + *
94924 + * LEGACY: LOADER
94925 + */
94926 +#define XEN_ELFNOTE_LOADER         8
94927 +
94928 +/*
94929 + * The kernel supports PAE (x86/32 only, string = "yes" or "no").
94930 + *
94931 + * LEGACY: PAE (n.b. The legacy interface included a provision to
94932 + * indicate 'extended-cr3' support allowing L3 page tables to be
94933 + * placed above 4G. It is assumed that any kernel new enough to use
94934 + * these ELF notes will include this and therefore "yes" here is
94935 + * equivalent to "yes[extended-cr3]" in the __xen_guest interface.)
94936 + */
94937 +#define XEN_ELFNOTE_PAE_MODE       9
94938 +
94939 +/*
94940 + * The features supported/required by this kernel (string).
94941 + *
94942 + * The string must consist of a list of feature names (as given in
94943 + * features.h, without the "XENFEAT_" prefix) separated by '|'
94944 + * characters. If a feature is required for the kernel to function
94945 + * then the feature name must be preceded by a '!' character.
94946 + *
94947 + * LEGACY: FEATURES
94948 + */
94949 +#define XEN_ELFNOTE_FEATURES      10
94950 +
94951 +/*
94952 + * The kernel requires the symbol table to be loaded (string = "yes" or "no")
94953 + * LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence
94954 + * of this string as a boolean flag rather than requiring "yes" or
94955 + * "no".)
94956 + */
94957 +#define XEN_ELFNOTE_BSD_SYMTAB    11
94958 +
94959 +/*
94960 + * The lowest address the hypervisor hole can begin at (numeric).
94961 + *
94962 + * This must not be set higher than HYPERVISOR_VIRT_START. Its presence
94963 + * also indicates to the hypervisor that the kernel can deal with the
94964 + * hole starting at a higher address.
94965 + */
94966 +#define XEN_ELFNOTE_HV_START_LOW  12
94967 +
94968 +/*
94969 + * System information exported through crash notes.
94970 + *
94971 + * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO 
94972 + * note in case of a system crash. This note will contain various
94973 + * information about the system, see xen/include/xen/elfcore.h.
94974 + */
94975 +#define XEN_ELFNOTE_CRASH_INFO 0x1000001
94976 +
94977 +/*
94978 + * System registers exported through crash notes.
94979 + *
94980 + * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS 
94981 + * note per cpu in case of a system crash. This note is architecture
94982 + * specific and will contain registers not saved in the "CORE" note.
94983 + * See xen/include/xen/elfcore.h for more information.
94984 + */
94985 +#define XEN_ELFNOTE_CRASH_REGS 0x1000002
94986 +
94987 +#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
94988 +
94989 +/*
94990 + * Local variables:
94991 + * mode: C
94992 + * c-set-style: "BSD"
94993 + * c-basic-offset: 4
94994 + * tab-width: 4
94995 + * indent-tabs-mode: nil
94996 + * End:
94997 + */
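Guest kernels normally emit these notes from assembly via an ELFNOTE-style macro; the C rendering below is only a sketch of the note layout for XEN_ELFNOTE_GUEST_OS. The ".note.Xen" section name, the 4-byte alignment and the zero padding of name/desc are assumptions about the usual ELF note layout, not requirements stated in this header.

    #include <stdint.h>

    struct xen_note_guest_os {
        uint32_t namesz, descsz, type;
        char     name[4];                 /* "Xen" + NUL                 */
        char     desc[8];                 /* "linux" + NUL, padded to 4  */
    };

    static const struct xen_note_guest_os guest_os_note
        __attribute__((section(".note.Xen"), aligned(4), used)) = {
        .namesz = 4,                      /* strlen("Xen") + 1   */
        .descsz = 6,                      /* strlen("linux") + 1 */
        .type   = XEN_ELFNOTE_GUEST_OS,
        .name   = "Xen",
        .desc   = "linux",
    };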
94998 diff -ruNp linux-2.6.19/include/xen/interface/event_channel.h linux-2.6.19-xen-3.0.4/include/xen/interface/event_channel.h
94999 --- linux-2.6.19/include/xen/interface/event_channel.h  1970-01-01 00:00:00.000000000 +0000
95000 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/event_channel.h        2007-02-02 19:11:00.000000000 +0000
95001 @@ -0,0 +1,251 @@
95002 +/******************************************************************************
95003 + * event_channel.h
95004 + * 
95005 + * Event channels between domains.
95006 + * 
95007 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95008 + * of this software and associated documentation files (the "Software"), to
95009 + * deal in the Software without restriction, including without limitation the
95010 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95011 + * sell copies of the Software, and to permit persons to whom the Software is
95012 + * furnished to do so, subject to the following conditions:
95013 + *
95014 + * The above copyright notice and this permission notice shall be included in
95015 + * all copies or substantial portions of the Software.
95016 + *
95017 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95018 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95019 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95020 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95021 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95022 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95023 + * DEALINGS IN THE SOFTWARE.
95024 + *
95025 + * Copyright (c) 2003-2004, K A Fraser.
95026 + */
95027 +
95028 +#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
95029 +#define __XEN_PUBLIC_EVENT_CHANNEL_H__
95030 +
95031 +/*
95032 + * Prototype for this hypercall is:
95033 + *  int event_channel_op(int cmd, void *args)
95034 + * @cmd  == EVTCHNOP_??? (event-channel operation).
95035 + * @args == Operation-specific extra arguments (NULL if none).
95036 + */
95037 +
95038 +typedef uint32_t evtchn_port_t;
95039 +DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
95040 +
95041 +/*
95042 + * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
95043 + * accepting interdomain bindings from domain <remote_dom>. A fresh port
95044 + * is allocated in <dom> and returned as <port>.
95045 + * NOTES:
95046 + *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
95047 + *  2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
95048 + */
95049 +#define EVTCHNOP_alloc_unbound    6
95050 +struct evtchn_alloc_unbound {
95051 +    /* IN parameters */
95052 +    domid_t dom, remote_dom;
95053 +    /* OUT parameters */
95054 +    evtchn_port_t port;
95055 +};
95056 +typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
95057 +
95058 +/*
95059 + * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
95060 + * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
95061 + * a port that is unbound and marked as accepting bindings from the calling
95062 + * domain. A fresh port is allocated in the calling domain and returned as
95063 + * <local_port>.
95064 + * NOTES:
95065 + *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
95066 + */
95067 +#define EVTCHNOP_bind_interdomain 0
95068 +struct evtchn_bind_interdomain {
95069 +    /* IN parameters. */
95070 +    domid_t remote_dom;
95071 +    evtchn_port_t remote_port;
95072 +    /* OUT parameters. */
95073 +    evtchn_port_t local_port;
95074 +};
95075 +typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
95076 +
95077 +/*
95078 + * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
95079 + * vcpu.
95080 + * NOTES:
95081 + *  1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
95082 + *     in xen.h for the classification of each VIRQ.
95083 + *  2. Global VIRQs must be allocated on VCPU0 but can subsequently be
95084 + *     re-bound via EVTCHNOP_bind_vcpu.
95085 + *  3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
95086 + *     The allocated event channel is bound to the specified vcpu and the
95087 + *     binding cannot be changed.
95088 + */
95089 +#define EVTCHNOP_bind_virq        1
95090 +struct evtchn_bind_virq {
95091 +    /* IN parameters. */
95092 +    uint32_t virq;
95093 +    uint32_t vcpu;
95094 +    /* OUT parameters. */
95095 +    evtchn_port_t port;
95096 +};
95097 +typedef struct evtchn_bind_virq evtchn_bind_virq_t;
95098 +
95099 +/*
95100 + * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
95101 + * NOTES:
95102 + *  1. A physical IRQ may be bound to at most one event channel per domain.
95103 + *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
95104 + */
95105 +#define EVTCHNOP_bind_pirq        2
95106 +struct evtchn_bind_pirq {
95107 +    /* IN parameters. */
95108 +    uint32_t pirq;
95109 +#define BIND_PIRQ__WILL_SHARE 1
95110 +    uint32_t flags; /* BIND_PIRQ__* */
95111 +    /* OUT parameters. */
95112 +    evtchn_port_t port;
95113 +};
95114 +typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
95115 +
95116 +/*
95117 + * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
95118 + * NOTES:
95119 + *  1. The allocated event channel is bound to the specified vcpu. The binding
95120 + *     may not be changed.
95121 + */
95122 +#define EVTCHNOP_bind_ipi         7
95123 +struct evtchn_bind_ipi {
95124 +    uint32_t vcpu;
95125 +    /* OUT parameters. */
95126 +    evtchn_port_t port;
95127 +};
95128 +typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
95129 +
95130 +/*
95131 + * EVTCHNOP_close: Close a local event channel <port>. If the channel is
95132 + * interdomain then the remote end is placed in the unbound state
95133 + * (EVTCHNSTAT_unbound), awaiting a new connection.
95134 + */
95135 +#define EVTCHNOP_close            3
95136 +struct evtchn_close {
95137 +    /* IN parameters. */
95138 +    evtchn_port_t port;
95139 +};
95140 +typedef struct evtchn_close evtchn_close_t;
95141 +
95142 +/*
95143 + * EVTCHNOP_send: Send an event to the remote end of the channel whose local
95144 + * endpoint is <port>.
95145 + */
95146 +#define EVTCHNOP_send             4
95147 +struct evtchn_send {
95148 +    /* IN parameters. */
95149 +    evtchn_port_t port;
95150 +};
95151 +typedef struct evtchn_send evtchn_send_t;
95152 +
95153 +/*
95154 + * EVTCHNOP_status: Get the current status of the communication channel which
95155 + * has an endpoint at <dom, port>.
95156 + * NOTES:
95157 + *  1. <dom> may be specified as DOMID_SELF.
95158 + *  2. Only a sufficiently-privileged domain may obtain the status of an event
95159 + *     channel for which <dom> is not DOMID_SELF.
95160 + */
95161 +#define EVTCHNOP_status           5
95162 +struct evtchn_status {
95163 +    /* IN parameters */
95164 +    domid_t  dom;
95165 +    evtchn_port_t port;
95166 +    /* OUT parameters */
95167 +#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
95168 +#define EVTCHNSTAT_unbound      1  /* Channel is waiting interdom connection.*/
95169 +#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
95170 +#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
95171 +#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
95172 +#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
95173 +    uint32_t status;
95174 +    uint32_t vcpu;                 /* VCPU to which this channel is bound.   */
95175 +    union {
95176 +        struct {
95177 +            domid_t dom;
95178 +        } unbound; /* EVTCHNSTAT_unbound */
95179 +        struct {
95180 +            domid_t dom;
95181 +            evtchn_port_t port;
95182 +        } interdomain; /* EVTCHNSTAT_interdomain */
95183 +        uint32_t pirq;      /* EVTCHNSTAT_pirq        */
95184 +        uint32_t virq;      /* EVTCHNSTAT_virq        */
95185 +    } u;
95186 +};
95187 +typedef struct evtchn_status evtchn_status_t;
95188 +
95189 +/*
95190 + * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
95191 + * event is pending.
95192 + * NOTES:
95193 + *  1. IPI-bound channels always notify the vcpu specified at bind time.
95194 + *     This binding cannot be changed.
95195 + *  2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
95196 + *     This binding cannot be changed.
95197 + *  3. All other channels notify vcpu0 by default. This default is set when
95198 + *     the channel is allocated (a port that is freed and subsequently reused
95199 + *     has its binding reset to vcpu0).
95200 + */
95201 +#define EVTCHNOP_bind_vcpu        8
95202 +struct evtchn_bind_vcpu {
95203 +    /* IN parameters. */
95204 +    evtchn_port_t port;
95205 +    uint32_t vcpu;
95206 +};
95207 +typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
95208 +
95209 +/*
95210 + * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
95211 + * a notification to the appropriate VCPU if an event is pending.
95212 + */
95213 +#define EVTCHNOP_unmask           9
95214 +struct evtchn_unmask {
95215 +    /* IN parameters. */
95216 +    evtchn_port_t port;
95217 +};
95218 +typedef struct evtchn_unmask evtchn_unmask_t;
95219 +
95220 +/*
95221 + * Argument to event_channel_op_compat() hypercall. Superseded by new
95222 + * event_channel_op() hypercall since 0x00030202.
95223 + */
95224 +struct evtchn_op {
95225 +    uint32_t cmd; /* EVTCHNOP_* */
95226 +    union {
95227 +        struct evtchn_alloc_unbound    alloc_unbound;
95228 +        struct evtchn_bind_interdomain bind_interdomain;
95229 +        struct evtchn_bind_virq        bind_virq;
95230 +        struct evtchn_bind_pirq        bind_pirq;
95231 +        struct evtchn_bind_ipi         bind_ipi;
95232 +        struct evtchn_close            close;
95233 +        struct evtchn_send             send;
95234 +        struct evtchn_status           status;
95235 +        struct evtchn_bind_vcpu        bind_vcpu;
95236 +        struct evtchn_unmask           unmask;
95237 +    } u;
95238 +};
95239 +typedef struct evtchn_op evtchn_op_t;
95240 +DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
95241 +
95242 +#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
95243 +
95244 +/*
95245 + * Local variables:
95246 + * mode: C
95247 + * c-set-style: "BSD"
95248 + * c-basic-offset: 4
95249 + * tab-width: 4
95250 + * indent-tabs-mode: nil
95251 + * End:
95252 + */
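A hedged sketch of the most common backend-side use of the operations above: offering an unbound port that a frontend in <remote> may later bind to. DOMID_SELF comes from xen.h; the HYPERVISOR_event_channel_op() wrapper name is an assumption from the sparse tree (the header itself only gives the raw prototype).

    /* Hypothetical helper: allocate an unbound port for a frontend domain. */
    static int offer_port(domid_t remote, evtchn_port_t *port_out)
    {
        struct evtchn_alloc_unbound op = {
            .dom        = DOMID_SELF,     /* allocate in our own table    */
            .remote_dom = remote,         /* who is allowed to bind to it */
        };
        int rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);

        if (rc == 0)
            *port_out = op.port;
        return rc;
    }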
95253 diff -ruNp linux-2.6.19/include/xen/interface/features.h linux-2.6.19-xen-3.0.4/include/xen/interface/features.h
95254 --- linux-2.6.19/include/xen/interface/features.h       1970-01-01 00:00:00.000000000 +0000
95255 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/features.h     2007-02-02 19:11:00.000000000 +0000
95256 @@ -0,0 +1,71 @@
95257 +/******************************************************************************
95258 + * features.h
95259 + * 
95260 + * Feature flags, reported by XENVER_get_features.
95261 + * 
95262 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95263 + * of this software and associated documentation files (the "Software"), to
95264 + * deal in the Software without restriction, including without limitation the
95265 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95266 + * sell copies of the Software, and to permit persons to whom the Software is
95267 + * furnished to do so, subject to the following conditions:
95268 + *
95269 + * The above copyright notice and this permission notice shall be included in
95270 + * all copies or substantial portions of the Software.
95271 + *
95272 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95273 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95274 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95275 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95276 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95277 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95278 + * DEALINGS IN THE SOFTWARE.
95279 + *
95280 + * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
95281 + */
95282 +
95283 +#ifndef __XEN_PUBLIC_FEATURES_H__
95284 +#define __XEN_PUBLIC_FEATURES_H__
95285 +
95286 +/*
95287 + * If set, the guest does not need to write-protect its pagetables, and can
95288 + * update them via direct writes.
95289 + */
95290 +#define XENFEAT_writable_page_tables       0
95291 +
95292 +/*
95293 + * If set, the guest does not need to write-protect its segment descriptor
95294 + * tables, and can update them via direct writes.
95295 + */
95296 +#define XENFEAT_writable_descriptor_tables 1
95297 +
95298 +/*
95299 + * If set, translation between the guest's 'pseudo-physical' address space
95300 + * and the host's machine address space are handled by the hypervisor. In this
95301 + * mode the guest does not need to perform phys-to/from-machine translations
95302 + * when performing page table operations.
95303 + */
95304 +#define XENFEAT_auto_translated_physmap    2
95305 +
95306 +/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
95307 +#define XENFEAT_supervisor_mode_kernel     3
95308 +
95309 +/*
95310 + * If set, the guest does not need to allocate x86 PAE page directories
95311 + * below 4GB. This flag is usually implied by auto_translated_physmap.
95312 + */
95313 +#define XENFEAT_pae_pgdir_above_4gb        4
95314 +
95315 +#define XENFEAT_NR_SUBMAPS 1
95316 +
95317 +#endif /* __XEN_PUBLIC_FEATURES_H__ */
95318 +
95319 +/*
95320 + * Local variables:
95321 + * mode: C
95322 + * c-set-style: "BSD"
95323 + * c-basic-offset: 4
95324 + * tab-width: 4
95325 + * indent-tabs-mode: nil
95326 + * End:
95327 + */
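Since XENFEAT_NR_SUBMAPS is 1, every flag above fits in a single 32-bit submap. A minimal sketch of the bit test; fetching the submap itself would go through the XENVER_get_features version-op, which lives in a separate header and is only assumed here:

    #include <stdint.h>

    /* Hypothetical helper: test one XENFEAT_* bit in submap 0. */
    static int has_feature(uint32_t submap0, unsigned int feat)
    {
        return (submap0 >> feat) & 1;
    }

    /* e.g. has_feature(submap0, XENFEAT_writable_page_tables) */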
95328 diff -ruNp linux-2.6.19/include/xen/interface/grant_table.h linux-2.6.19-xen-3.0.4/include/xen/interface/grant_table.h
95329 --- linux-2.6.19/include/xen/interface/grant_table.h    1970-01-01 00:00:00.000000000 +0000
95330 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/grant_table.h  2007-02-02 19:11:00.000000000 +0000
95331 @@ -0,0 +1,380 @@
95332 +/******************************************************************************
95333 + * grant_table.h
95334 + * 
95335 + * Interface for granting foreign access to page frames, and receiving
95336 + * page-ownership transfers.
95337 + * 
95338 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95339 + * of this software and associated documentation files (the "Software"), to
95340 + * deal in the Software without restriction, including without limitation the
95341 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95342 + * sell copies of the Software, and to permit persons to whom the Software is
95343 + * furnished to do so, subject to the following conditions:
95344 + *
95345 + * The above copyright notice and this permission notice shall be included in
95346 + * all copies or substantial portions of the Software.
95347 + *
95348 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95349 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95350 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95351 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95352 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95353 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95354 + * DEALINGS IN THE SOFTWARE.
95355 + *
95356 + * Copyright (c) 2004, K A Fraser
95357 + */
95358 +
95359 +#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
95360 +#define __XEN_PUBLIC_GRANT_TABLE_H__
95361 +
95362 +
95363 +/***********************************
95364 + * GRANT TABLE REPRESENTATION
95365 + */
95366 +
95367 +/* Some rough guidelines on accessing and updating grant-table entries
95368 + * in a concurrency-safe manner. For more information, Linux contains a
95369 + * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
95370 + * 
95371 + * NB. WMB is a no-op on current-generation x86 processors. However, a
95372 + *     compiler barrier will still be required.
95373 + * 
95374 + * Introducing a valid entry into the grant table:
95375 + *  1. Write ent->domid.
95376 + *  2. Write ent->frame:
95377 + *      GTF_permit_access:   Frame to which access is permitted.
95378 + *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
95379 + *                           frame, or zero if none.
95380 + *  3. Write memory barrier (WMB).
95381 + *  4. Write ent->flags, inc. valid type.
95382 + * 
95383 + * Invalidating an unused GTF_permit_access entry:
95384 + *  1. flags = ent->flags.
95385 + *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
95386 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
95387 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
95388 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
95389 + *
95390 + * Invalidating an in-use GTF_permit_access entry:
95391 + *  This cannot be done directly. Request assistance from the domain controller
95392 + *  which can set a timeout on the use of a grant entry and take necessary
95393 + *  action. (NB. This is not yet implemented!).
95394 + * 
95395 + * Invalidating an unused GTF_accept_transfer entry:
95396 + *  1. flags = ent->flags.
95397 + *  2. Observe that !(flags & GTF_transfer_committed). [*]
95398 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
95399 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
95400 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
95401 + *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
95402 + *      The guest must /not/ modify the grant entry until the address of the
95403 + *      transferred frame is written. It is safe for the guest to spin waiting
95404 + *      for this to occur (detect by observing GTF_transfer_completed in
95405 + *      ent->flags).
95406 + *
95407 + * Invalidating a committed GTF_accept_transfer entry:
95408 + *  1. Wait for (ent->flags & GTF_transfer_completed).
95409 + *
95410 + * Changing a GTF_permit_access from writable to read-only:
95411 + *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
95412 + * 
95413 + * Changing a GTF_permit_access from read-only to writable:
95414 + *  Use SMP-safe bit-setting instruction.
95415 + */
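The "introducing a valid entry" steps above map onto very little code. A hedged sketch, using the grant_entry layout and GTF_* flags defined just below and assuming the kernel's wmb() write barrier:

    /* Hypothetical helper: grant <domid> access to <frame>, per steps 1-4. */
    static void gnttab_grant_access(struct grant_entry *ent, domid_t domid,
                                    uint32_t frame, int readonly)
    {
        ent->domid = domid;                    /* step 1                        */
        ent->frame = frame;                    /* step 2                        */
        wmb();                                 /* step 3: write memory barrier  */
        ent->flags = GTF_permit_access |       /* step 4: valid type goes last  */
                     (readonly ? GTF_readonly : 0);
    }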
95416 +
95417 +/*
95418 + * A grant table comprises a packed array of grant entries in one or more
95419 + * page frames shared between Xen and a guest.
95420 + * [XEN]: This field is written by Xen and read by the sharing guest.
95421 + * [GST]: This field is written by the guest and read by Xen.
95422 + */
95423 +struct grant_entry {
95424 +    /* GTF_xxx: various type and flag information.  [XEN,GST] */
95425 +    uint16_t flags;
95426 +    /* The domain being granted foreign privileges. [GST] */
95427 +    domid_t  domid;
95428 +    /*
95429 +     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
95430 +     * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
95431 +     */
95432 +    uint32_t frame;
95433 +};
95434 +typedef struct grant_entry grant_entry_t;
95435 +
95436 +/*
95437 + * Type of grant entry.
95438 + *  GTF_invalid: This grant entry grants no privileges.
95439 + *  GTF_permit_access: Allow @domid to map/access @frame.
95440 + *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
95441 + *                       to this guest. Xen writes the page number to @frame.
95442 + */
95443 +#define GTF_invalid         (0U<<0)
95444 +#define GTF_permit_access   (1U<<0)
95445 +#define GTF_accept_transfer (2U<<0)
95446 +#define GTF_type_mask       (3U<<0)
95447 +
95448 +/*
95449 + * Subflags for GTF_permit_access.
95450 + *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
95451 + *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
95452 + *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
95453 + */
95454 +#define _GTF_readonly       (2)
95455 +#define GTF_readonly        (1U<<_GTF_readonly)
95456 +#define _GTF_reading        (3)
95457 +#define GTF_reading         (1U<<_GTF_reading)
95458 +#define _GTF_writing        (4)
95459 +#define GTF_writing         (1U<<_GTF_writing)
95460 +
95461 +/*
95462 + * Subflags for GTF_accept_transfer:
95463 + *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
95464 + *      to transferring ownership of a page frame. When a guest sees this flag
95465 + *      it must /not/ modify the grant entry until GTF_transfer_completed is
95466 + *      set by Xen.
95467 + *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
95468 + *      after reading GTF_transfer_committed. Xen will always write the frame
95469 + *      address, followed by ORing this flag, in a timely manner.
95470 + */
95471 +#define _GTF_transfer_committed (2)
95472 +#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
95473 +#define _GTF_transfer_completed (3)
95474 +#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
95475 +
95476 +
95477 +/***********************************
95478 + * GRANT TABLE QUERIES AND USES
95479 + */
95480 +
95481 +/*
95482 + * Reference to a grant entry in a specified domain's grant table.
95483 + */
95484 +typedef uint32_t grant_ref_t;
95485 +
95486 +/*
95487 + * Handle to track a mapping created via a grant reference.
95488 + */
95489 +typedef uint32_t grant_handle_t;
95490 +
95491 +/*
95492 + * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
95493 + * by devices and/or host CPUs. If successful, <handle> is a tracking number
95494 + * that must be presented later to destroy the mapping(s). On error, <handle>
95495 + * is a negative status code.
95496 + * NOTES:
95497 + *  1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
95498 + *     via which I/O devices may access the granted frame.
95499 + *  2. If GNTMAP_host_map is specified then a mapping will be added at
95500 + *     either a host virtual address in the current address space, or at
95501 + *     a PTE at the specified machine address.  The type of mapping to
95502 + *     perform is selected through the GNTMAP_contains_pte flag, and the 
95503 + *     address is specified in <host_addr>.
95504 + *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
95505 + *     host mapping is destroyed by other means then it is *NOT* guaranteed
95506 + *     to be accounted to the correct grant reference!
95507 + */
95508 +#define GNTTABOP_map_grant_ref        0
95509 +struct gnttab_map_grant_ref {
95510 +    /* IN parameters. */
95511 +    uint64_t host_addr;
95512 +    uint32_t flags;               /* GNTMAP_* */
95513 +    grant_ref_t ref;
95514 +    domid_t  dom;
95515 +    /* OUT parameters. */
95516 +    int16_t  status;              /* GNTST_* */
95517 +    grant_handle_t handle;
95518 +    uint64_t dev_bus_addr;
95519 +};
95520 +typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
95521 +DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
95522 +
95523 +/*
95524 + * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
95525 + * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
95526 + * field is ignored. If non-zero, they must refer to a device/host mapping
95527 + * that is tracked by <handle>
95528 + * NOTES:
95529 + *  1. The call may fail in an undefined manner if either mapping is not
95530 + *     tracked by <handle>.
95531 + *  2. After executing a batch of unmaps, it is guaranteed that no stale
95532 + *     mappings will remain in the device or host TLBs.
95533 + */
95534 +#define GNTTABOP_unmap_grant_ref      1
95535 +struct gnttab_unmap_grant_ref {
95536 +    /* IN parameters. */
95537 +    uint64_t host_addr;
95538 +    uint64_t dev_bus_addr;
95539 +    grant_handle_t handle;
95540 +    /* OUT parameters. */
95541 +    int16_t  status;              /* GNTST_* */
95542 +};
95543 +typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
95544 +DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
95545 +
95546 +/*
95547 + * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
95548 + * <nr_frames> pages. The frame addresses are written to the <frame_list>.
95549 + * Only <nr_frames> addresses are written, even if the table is larger.
95550 + * NOTES:
95551 + *  1. <dom> may be specified as DOMID_SELF.
95552 + *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
95553 + *  3. Xen may not support more than a single grant-table page per domain.
95554 + */
95555 +#define GNTTABOP_setup_table          2
95556 +struct gnttab_setup_table {
95557 +    /* IN parameters. */
95558 +    domid_t  dom;
95559 +    uint32_t nr_frames;
95560 +    /* OUT parameters. */
95561 +    int16_t  status;              /* GNTST_* */
95562 +    XEN_GUEST_HANDLE(ulong) frame_list;
95563 +};
95564 +typedef struct gnttab_setup_table gnttab_setup_table_t;
95565 +DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
95566 +
95567 +/*
95568 + * GNTTABOP_dump_table: Dump the contents of the grant table to the
95569 + * xen console. Debugging use only.
95570 + */
95571 +#define GNTTABOP_dump_table           3
95572 +struct gnttab_dump_table {
95573 +    /* IN parameters. */
95574 +    domid_t dom;
95575 +    /* OUT parameters. */
95576 +    int16_t status;               /* GNTST_* */
95577 +};
95578 +typedef struct gnttab_dump_table gnttab_dump_table_t;
95579 +DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
95580 +
95581 +/*
95582 + * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
95583 + * foreign domain has previously registered its interest in the transfer via
95584 + * <domid, ref>.
95585 + * 
95586 + * Note that, even if the transfer fails, the specified page no longer belongs
95587 + * to the calling domain *unless* the error is GNTST_bad_page.
95588 + */
95589 +#define GNTTABOP_transfer                4
95590 +struct gnttab_transfer {
95591 +    /* IN parameters. */
95592 +    xen_pfn_t     mfn;
95593 +    domid_t       domid;
95594 +    grant_ref_t   ref;
95595 +    /* OUT parameters. */
95596 +    int16_t       status;
95597 +};
95598 +typedef struct gnttab_transfer gnttab_transfer_t;
95599 +DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
95600 +
95601 +
95602 +/*
95603 + * GNTTABOP_copy: Hypervisor-based copy.
95604 + * Source and destinations can be either MFNs or, for foreign domains,
95605 + * grant references. The foreign domain has to grant read/write access
95606 + * in its grant table.
95607 + *
95608 + * The flags specify what type source and destinations are (either MFN
95609 + * or grant reference).
95610 + *
95611 + * Note that this can also be used to copy data between two domains
95612 + * via a third party if the source and destination domains had previously
95613 + * granted appropriate access to their pages to the third party.
95614 + *
95615 + * source_offset specifies an offset in the source frame, dest_offset
95616 + * the offset in the target frame and len specifies the number of
95617 + * bytes to be copied.
95618 + */
95619 +
95620 +#define _GNTCOPY_source_gref      (0)
95621 +#define GNTCOPY_source_gref       (1<<_GNTCOPY_source_gref)
95622 +#define _GNTCOPY_dest_gref        (1)
95623 +#define GNTCOPY_dest_gref         (1<<_GNTCOPY_dest_gref)
95624 +
95625 +#define GNTTABOP_copy                 5
95626 +typedef struct gnttab_copy {
95627 +    /* IN parameters. */
95628 +    struct {
95629 +        union {
95630 +            grant_ref_t ref;
95631 +            xen_pfn_t   gmfn;
95632 +        } u;
95633 +        domid_t  domid;
95634 +        uint16_t offset;
95635 +    } source, dest;
95636 +    uint16_t      len;
95637 +    uint16_t      flags;          /* GNTCOPY_* */
95638 +    /* OUT parameters. */
95639 +    int16_t       status;
95640 +} gnttab_copy_t;
95641 +DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
95642 +
95643 +
95644 +/*
95645 + * Bitfield values for update_pin_status.flags.
95646 + */
95647 + /* Map the grant entry for access by I/O devices. */
95648 +#define _GNTMAP_device_map      (0)
95649 +#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
95650 + /* Map the grant entry for access by host CPUs. */
95651 +#define _GNTMAP_host_map        (1)
95652 +#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
95653 + /* Accesses to the granted frame will be restricted to read-only access. */
95654 +#define _GNTMAP_readonly        (2)
95655 +#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
95656 + /*
95657 +  * GNTMAP_host_map subflag:
95658 +  *  0 => The host mapping is usable only by the guest OS.
95659 +  *  1 => The host mapping is usable by guest OS + current application.
95660 +  */
95661 +#define _GNTMAP_application_map (3)
95662 +#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
95663 +
95664 + /*
95665 +  * GNTMAP_contains_pte subflag:
95666 +  *  0 => This map request contains a host virtual address.
95667 +  *  1 => This map request contains the machine address of the PTE to update.
95668 +  */
95669 +#define _GNTMAP_contains_pte    (4)
95670 +#define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
95671 +
95672 +/*
95673 + * Values for error status returns. All errors are -ve.
95674 + */
95675 +#define GNTST_okay             (0)  /* Normal return.                        */
95676 +#define GNTST_general_error    (-1) /* General undefined error.              */
95677 +#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
95678 +#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
95679 +#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
95680 +#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
95681 +#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
95682 +#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
95683 +#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
95684 +#define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
95685 +#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
95686 +
95687 +#define GNTTABOP_error_msgs {                   \
95688 +    "okay",                                     \
95689 +    "undefined error",                          \
95690 +    "unrecognised domain id",                   \
95691 +    "invalid grant reference",                  \
95692 +    "invalid mapping handle",                   \
95693 +    "invalid virtual address",                  \
95694 +    "invalid device address",                   \
95695 +    "no spare translation slot in the I/O MMU", \
95696 +    "permission denied",                        \
95697 +    "bad page",                                 \
95698 +    "copy arguments cross page boundary"        \
95699 +}
95700 +
95701 +#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
95702 +
95703 +/*
95704 + * Local variables:
95705 + * mode: C
95706 + * c-set-style: "BSD"
95707 + * c-basic-offset: 4
95708 + * tab-width: 4
95709 + * indent-tabs-mode: nil
95710 + * End:
95711 + */
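The GNTTABOP_error_msgs table lines up with the negated GNTST_* codes, so a status-to-string helper is a one-liner plus a bounds check. A small sketch (helper name hypothetical):

    #include <stdint.h>

    /* Hypothetical helper: map a GNTST_* status to its message. */
    static const char *gntst_to_string(int16_t status)
    {
        static const char *msgs[] = GNTTABOP_error_msgs;

        if (status > 0 || status < GNTST_bad_copy_arg)
            return "unknown status";
        return msgs[-status];    /* codes are <= 0, so negate to index */
    }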
95712 diff -ruNp linux-2.6.19/include/xen/interface/hvm/e820.h linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/e820.h
95713 --- linux-2.6.19/include/xen/interface/hvm/e820.h       1970-01-01 00:00:00.000000000 +0000
95714 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/e820.h     2007-02-02 19:11:00.000000000 +0000
95715 @@ -0,0 +1,47 @@
95716 +
95717 +/*
95718 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95719 + * of this software and associated documentation files (the "Software"), to
95720 + * deal in the Software without restriction, including without limitation the
95721 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95722 + * sell copies of the Software, and to permit persons to whom the Software is
95723 + * furnished to do so, subject to the following conditions:
95724 + *
95725 + * The above copyright notice and this permission notice shall be included in
95726 + * all copies or substantial portions of the Software.
95727 + *
95728 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95729 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95730 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95731 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95732 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95733 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95734 + * DEALINGS IN THE SOFTWARE.
95735 + */
95736 +
95737 +#ifndef __XEN_PUBLIC_HVM_E820_H__
95738 +#define __XEN_PUBLIC_HVM_E820_H__
95739 +
95740 +/* PC BIOS standard E820 types. */
95741 +#define E820_RAM          1
95742 +#define E820_RESERVED     2
95743 +#define E820_ACPI         3
95744 +#define E820_NVS          4
95745 +
95746 +/* E820 location in HVM virtual address space. */
95747 +#define E820_MAP_PAGE        0x00090000
95748 +#define E820_MAP_NR_OFFSET   0x000001E8
95749 +#define E820_MAP_OFFSET      0x000002D0
95750 +
95751 +struct e820entry {
95752 +    uint64_t addr;
95753 +    uint64_t size;
95754 +    uint32_t type;
95755 +} __attribute__((packed));
95756 +
95757 +#define HVM_BELOW_4G_RAM_END        0xF0000000
95758 +
95759 +#define HVM_BELOW_4G_MMIO_START     HVM_BELOW_4G_RAM_END
95760 +#define HVM_BELOW_4G_MMIO_LENGTH    ((1ULL << 32) - HVM_BELOW_4G_MMIO_START)
95761 +
95762 +#endif /* __XEN_PUBLIC_HVM_E820_H__ */
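A minimal sketch of how loader/firmware code in the HVM guest might walk this map;
that the region is directly addressable and that the entry count at
E820_MAP_NR_OFFSET is a single byte are assumptions here, not guarantees of this
header:

    /* Sum up RAM reported by the E820 map at its fixed guest-physical spot. */
    static uint64_t e820_ram_bytes(void)
    {
        uint8_t nr = *(uint8_t *)(unsigned long)(E820_MAP_PAGE + E820_MAP_NR_OFFSET);
        struct e820entry *map =
            (struct e820entry *)(unsigned long)(E820_MAP_PAGE + E820_MAP_OFFSET);
        uint64_t ram = 0;
        unsigned int i;

        for (i = 0; i < nr; i++)
            if (map[i].type == E820_RAM)
                ram += map[i].size;
        return ram;
    }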
95763 diff -ruNp linux-2.6.19/include/xen/interface/hvm/hvm_info_table.h linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/hvm_info_table.h
95764 --- linux-2.6.19/include/xen/interface/hvm/hvm_info_table.h     1970-01-01 00:00:00.000000000 +0000
95765 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/hvm_info_table.h   2007-02-02 19:11:00.000000000 +0000
95766 @@ -0,0 +1,41 @@
95767 +/******************************************************************************
95768 + * hvm/hvm_info_table.h
95769 + * 
95770 + * HVM parameter and information table, written into guest memory map.
95771 + *
95772 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95773 + * of this software and associated documentation files (the "Software"), to
95774 + * deal in the Software without restriction, including without limitation the
95775 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95776 + * sell copies of the Software, and to permit persons to whom the Software is
95777 + * furnished to do so, subject to the following conditions:
95778 + *
95779 + * The above copyright notice and this permission notice shall be included in
95780 + * all copies or substantial portions of the Software.
95781 + *
95782 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95783 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95784 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95785 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95786 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95787 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95788 + * DEALINGS IN THE SOFTWARE.
95789 + */
95790 +
95791 +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
95792 +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
95793 +
95794 +#define HVM_INFO_PFN         0x09F
95795 +#define HVM_INFO_OFFSET      0x800
95796 +#define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
95797 +
95798 +struct hvm_info_table {
95799 +    char        signature[8]; /* "HVM INFO" */
95800 +    uint32_t    length;
95801 +    uint8_t     checksum;
95802 +    uint8_t     acpi_enabled;
95803 +    uint8_t     apic_mode;
95804 +    uint32_t    nr_vcpus;
95805 +};
95806 +
95807 +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
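A hedged sketch of consuming this table: the caller is assumed to have located it
at HVM_INFO_PADDR, and the checksum is assumed to follow the usual convention that
the byte sum over 'length' bytes is zero (neither point is stated by the header
itself):

    #include <string.h>   /* memcmp, for the signature check */

    static int hvm_info_valid(const struct hvm_info_table *t)
    {
        uint8_t sum = 0;
        uint32_t i;

        if (memcmp(t->signature, "HVM INFO", 8) != 0)
            return 0;
        for (i = 0; i < t->length; i++)
            sum += ((const uint8_t *)t)[i];
        return sum == 0;   /* assumed convention: bytes sum to zero */
    }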
95808 diff -ruNp linux-2.6.19/include/xen/interface/hvm/hvm_op.h linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/hvm_op.h
95809 --- linux-2.6.19/include/xen/interface/hvm/hvm_op.h     1970-01-01 00:00:00.000000000 +0000
95810 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/hvm_op.h   2007-02-02 19:11:00.000000000 +0000
95811 @@ -0,0 +1,53 @@
95812 +#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
95813 +#define __XEN_PUBLIC_HVM_HVM_OP_H__
95814 +
95815 +/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */
95816 +#define HVMOP_set_param           0
95817 +#define HVMOP_get_param           1
95818 +struct xen_hvm_param {
95819 +    domid_t  domid;    /* IN */
95820 +    uint32_t index;    /* IN */
95821 +    uint64_t value;    /* IN/OUT */
95822 +};
95823 +typedef struct xen_hvm_param xen_hvm_param_t;
95824 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t);
95825 +
95826 +/* Set the logical level of one of a domain's PCI INTx wires. */
95827 +#define HVMOP_set_pci_intx_level  2
95828 +struct xen_hvm_set_pci_intx_level {
95829 +    /* Domain to be updated. */
95830 +    domid_t  domid;
95831 +    /* PCI INTx identification in PCI topology (domain:bus:device:intx). */
95832 +    uint8_t  domain, bus, device, intx;
95833 +    /* Assertion level (0 = unasserted, 1 = asserted). */
95834 +    uint8_t  level;
95835 +};
95836 +typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t;
95837 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t);
95838 +
95839 +/* Set the logical level of one of a domain's ISA IRQ wires. */
95840 +#define HVMOP_set_isa_irq_level   3
95841 +struct xen_hvm_set_isa_irq_level {
95842 +    /* Domain to be updated. */
95843 +    domid_t  domid;
95844 +    /* ISA device identification, by ISA IRQ (0-15). */
95845 +    uint8_t  isa_irq;
95846 +    /* Assertion level (0 = unasserted, 1 = asserted). */
95847 +    uint8_t  level;
95848 +};
95849 +typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t;
95850 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t);
95851 +
95852 +#define HVMOP_set_pci_link_route  4
95853 +struct xen_hvm_set_pci_link_route {
95854 +    /* Domain to be updated. */
95855 +    domid_t  domid;
95856 +    /* PCI link identifier (0-3). */
95857 +    uint8_t  link;
95858 +    /* ISA IRQ (1-15), or 0 (disable link). */
95859 +    uint8_t  isa_irq;
95860 +};
95861 +typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t;
95862 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
95863 +
95864 +#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
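As a sketch of the get/set interface above, a guest could read one parameter like
this; HYPERVISOR_hvm_op() and DOMID_SELF are assumed to come from the hypercall and
xen.h headers elsewhere in this patch and are not defined in this file:

    static int hvm_get_store_pfn(uint64_t *pfn)
    {
        struct xen_hvm_param p = {
            .domid = DOMID_SELF,             /* current domain */
            .index = HVM_PARAM_STORE_PFN,    /* see params.h below */
        };
        int rc = HYPERVISOR_hvm_op(HVMOP_get_param, &p);

        if (rc == 0)
            *pfn = p.value;                  /* value is the OUT field */
        return rc;
    }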
95865 diff -ruNp linux-2.6.19/include/xen/interface/hvm/ioreq.h linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/ioreq.h
95866 --- linux-2.6.19/include/xen/interface/hvm/ioreq.h      1970-01-01 00:00:00.000000000 +0000
95867 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/ioreq.h    2007-02-02 19:11:00.000000000 +0000
95868 @@ -0,0 +1,97 @@
95869 +/*
95870 + * ioreq.h: I/O request definitions for device models
95871 + * Copyright (c) 2004, Intel Corporation.
95872 + * 
95873 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95874 + * of this software and associated documentation files (the "Software"), to
95875 + * deal in the Software without restriction, including without limitation the
95876 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95877 + * sell copies of the Software, and to permit persons to whom the Software is
95878 + * furnished to do so, subject to the following conditions:
95879 + *
95880 + * The above copyright notice and this permission notice shall be included in
95881 + * all copies or substantial portions of the Software.
95882 + *
95883 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95884 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95885 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95886 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95887 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95888 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95889 + * DEALINGS IN THE SOFTWARE.
95890 + */
95891 +
95892 +#ifndef _IOREQ_H_
95893 +#define _IOREQ_H_
95894 +
95895 +#define IOREQ_READ      1
95896 +#define IOREQ_WRITE     0
95897 +
95898 +#define STATE_IOREQ_NONE        0
95899 +#define STATE_IOREQ_READY       1
95900 +#define STATE_IOREQ_INPROCESS   2
95901 +#define STATE_IORESP_READY      3
95902 +
95903 +#define IOREQ_TYPE_PIO          0 /* pio */
95904 +#define IOREQ_TYPE_COPY         1 /* mmio ops */
95905 +#define IOREQ_TYPE_AND          2
95906 +#define IOREQ_TYPE_OR           3
95907 +#define IOREQ_TYPE_XOR          4
95908 +#define IOREQ_TYPE_XCHG         5
95909 +#define IOREQ_TYPE_ADD          6
95910 +
95911 +/*
95912 + * VMExit dispatcher should cooperate with instruction decoder to
95913 + * prepare this structure and notify service OS and DM by sending
95914 + * virq
95915 + */
95916 +struct ioreq {
95917 +    uint64_t addr;          /*  physical address            */
95918 +    uint64_t size;          /*  size in bytes               */
95919 +    uint64_t count;         /*  for rep prefixes            */
95920 +    uint64_t data;          /*  data (or paddr of data)     */
95921 +    uint8_t state:4;
95922 +    uint8_t data_is_ptr:1;  /*  if 1, data above is the guest paddr 
95923 +                             *   of the real data to use.   */
95924 +    uint8_t dir:1;          /*  1=read, 0=write             */
95925 +    uint8_t df:1;
95926 +    uint8_t type;           /* I/O type                     */
95927 +    uint64_t io_count;      /* How many IO done on a vcpu   */
95928 +};
95929 +typedef struct ioreq ioreq_t;
95930 +
95931 +struct vcpu_iodata {
95932 +    struct ioreq         vp_ioreq;
95933 +    /* Event channel port */
95934 +    unsigned int    vp_eport;   /* VMX vcpu uses this to notify DM */
95935 +};
95936 +typedef struct vcpu_iodata vcpu_iodata_t;
95937 +
95938 +struct shared_iopage {
95939 +    struct vcpu_iodata   vcpu_iodata[1];
95940 +};
95941 +typedef struct shared_iopage shared_iopage_t;
95942 +
95943 +#define IOREQ_BUFFER_SLOT_NUM     80
95944 +struct buffered_iopage {
95945 +    unsigned long   read_pointer;
95946 +    unsigned long   write_pointer;
95947 +    ioreq_t         ioreq[IOREQ_BUFFER_SLOT_NUM];
95948 +};            /* sizeof this structure must fit within one page */
95949 +typedef struct buffered_iopage buffered_iopage_t;
95950 +
95951 +#define ACPI_PM1A_EVT_BLK_ADDRESS           0x0000000000001f40
95952 +#define ACPI_PM1A_CNT_BLK_ADDRESS           (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04)
95953 +#define ACPI_PM_TMR_BLK_ADDRESS             (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08)
95954 +
95955 +#endif /* _IOREQ_H_ */
95956 +
95957 +/*
95958 + * Local variables:
95959 + * mode: C
95960 + * c-set-style: "BSD"
95961 + * c-basic-offset: 4
95962 + * tab-width: 4
95963 + * indent-tabs-mode: nil
95964 + * End:
95965 + */
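A rough sketch of how a device model might drain the buffered ioreq ring described
above; handle_ioreq() is a placeholder for the caller's dispatch code, and memory
barriers and locking are deliberately omitted:

    static void drain_buffered_ioreqs(struct buffered_iopage *pg)
    {
        while (pg->read_pointer != pg->write_pointer) {
            /* 80 slots, so index with a plain modulo rather than a mask. */
            ioreq_t *req = &pg->ioreq[pg->read_pointer % IOREQ_BUFFER_SLOT_NUM];

            handle_ioreq(req);               /* hypothetical handler */
            pg->read_pointer++;
        }
    }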
95966 diff -ruNp linux-2.6.19/include/xen/interface/hvm/params.h linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/params.h
95967 --- linux-2.6.19/include/xen/interface/hvm/params.h     1970-01-01 00:00:00.000000000 +0000
95968 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/params.h   2007-02-02 19:11:00.000000000 +0000
95969 @@ -0,0 +1,36 @@
95970 +
95971 +/*
95972 + * Permission is hereby granted, free of charge, to any person obtaining a copy
95973 + * of this software and associated documentation files (the "Software"), to
95974 + * deal in the Software without restriction, including without limitation the
95975 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
95976 + * sell copies of the Software, and to permit persons to whom the Software is
95977 + * furnished to do so, subject to the following conditions:
95978 + *
95979 + * The above copyright notice and this permission notice shall be included in
95980 + * all copies or substantial portions of the Software.
95981 + *
95982 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
95983 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
95984 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
95985 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
95986 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
95987 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
95988 + * DEALINGS IN THE SOFTWARE.
95989 + */
95990 +
95991 +#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
95992 +#define __XEN_PUBLIC_HVM_PARAMS_H__
95993 +
95994 +#include "hvm_op.h"
95995 +
95996 +/* Parameter space for HVMOP_{set,get}_param. */
95997 +#define HVM_PARAM_CALLBACK_IRQ 0
95998 +#define HVM_PARAM_STORE_PFN    1
95999 +#define HVM_PARAM_STORE_EVTCHN 2
96000 +#define HVM_PARAM_PAE_ENABLED  4
96001 +#define HVM_PARAM_IOREQ_PFN    5
96002 +#define HVM_PARAM_BUFIOREQ_PFN 6
96003 +#define HVM_NR_PARAMS          7
96004 +
96005 +#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
96006 diff -ruNp linux-2.6.19/include/xen/interface/hvm/vmx_assist.h linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/vmx_assist.h
96007 --- linux-2.6.19/include/xen/interface/hvm/vmx_assist.h 1970-01-01 00:00:00.000000000 +0000
96008 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/hvm/vmx_assist.h       2007-02-02 19:11:00.000000000 +0000
96009 @@ -0,0 +1,116 @@
96010 +/*
96011 + * vmx_assist.h: Context definitions for the VMXASSIST world switch.
96012 + *
96013 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96014 + * of this software and associated documentation files (the "Software"), to
96015 + * deal in the Software without restriction, including without limitation the
96016 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96017 + * sell copies of the Software, and to permit persons to whom the Software is
96018 + * furnished to do so, subject to the following conditions:
96019 + *
96020 + * The above copyright notice and this permission notice shall be included in
96021 + * all copies or substantial portions of the Software.
96022 + *
96023 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96024 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96025 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96026 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96027 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96028 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96029 + * DEALINGS IN THE SOFTWARE.
96030 + *
96031 + * Leendert van Doorn, leendert@watson.ibm.com
96032 + * Copyright (c) 2005, International Business Machines Corporation.
96033 + */
96034 +
96035 +#ifndef _VMX_ASSIST_H_
96036 +#define _VMX_ASSIST_H_
96037 +
96038 +#define VMXASSIST_BASE         0xD0000
96039 +#define VMXASSIST_MAGIC        0x17101966
96040 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
96041 +
96042 +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
96043 +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
96044 +
96045 +#ifndef __ASSEMBLY__
96046 +
96047 +union vmcs_arbytes {
96048 +    struct arbyte_fields {
96049 +        unsigned int seg_type : 4,
96050 +            s         : 1,
96051 +            dpl       : 2,
96052 +            p         : 1,
96053 +            reserved0 : 4,
96054 +            avl       : 1,
96055 +            reserved1 : 1,
96056 +            default_ops_size: 1,
96057 +            g         : 1,
96058 +            null_bit  : 1,
96059 +            reserved2 : 15;
96060 +    } fields;
96061 +    unsigned int bytes;
96062 +};
96063 +
96064 +/*
96065 + * World switch state
96066 + */
96067 +struct vmx_assist_context {
96068 +    uint32_t  eip;        /* execution pointer */
96069 +    uint32_t  esp;        /* stack pointer */
96070 +    uint32_t  eflags;     /* flags register */
96071 +    uint32_t  cr0;
96072 +    uint32_t  cr3;        /* page table directory */
96073 +    uint32_t  cr4;
96074 +    uint32_t  idtr_limit; /* idt */
96075 +    uint32_t  idtr_base;
96076 +    uint32_t  gdtr_limit; /* gdt */
96077 +    uint32_t  gdtr_base;
96078 +    uint32_t  cs_sel;     /* cs selector */
96079 +    uint32_t  cs_limit;
96080 +    uint32_t  cs_base;
96081 +    union vmcs_arbytes cs_arbytes;
96082 +    uint32_t  ds_sel;     /* ds selector */
96083 +    uint32_t  ds_limit;
96084 +    uint32_t  ds_base;
96085 +    union vmcs_arbytes ds_arbytes;
96086 +    uint32_t  es_sel;     /* es selector */
96087 +    uint32_t  es_limit;
96088 +    uint32_t  es_base;
96089 +    union vmcs_arbytes es_arbytes;
96090 +    uint32_t  ss_sel;     /* ss selector */
96091 +    uint32_t  ss_limit;
96092 +    uint32_t  ss_base;
96093 +    union vmcs_arbytes ss_arbytes;
96094 +    uint32_t  fs_sel;     /* fs selector */
96095 +    uint32_t  fs_limit;
96096 +    uint32_t  fs_base;
96097 +    union vmcs_arbytes fs_arbytes;
96098 +    uint32_t  gs_sel;     /* gs selector */
96099 +    uint32_t  gs_limit;
96100 +    uint32_t  gs_base;
96101 +    union vmcs_arbytes gs_arbytes;
96102 +    uint32_t  tr_sel;     /* task selector */
96103 +    uint32_t  tr_limit;
96104 +    uint32_t  tr_base;
96105 +    union vmcs_arbytes tr_arbytes;
96106 +    uint32_t  ldtr_sel;   /* ldtr selector */
96107 +    uint32_t  ldtr_limit;
96108 +    uint32_t  ldtr_base;
96109 +    union vmcs_arbytes ldtr_arbytes;
96110 +};
96111 +typedef struct vmx_assist_context vmx_assist_context_t;
96112 +
96113 +#endif /* __ASSEMBLY__ */
96114 +
96115 +#endif /* _VMX_ASSIST_H_ */
96116 +
96117 +/*
96118 + * Local variables:
96119 + * mode: C
96120 + * c-set-style: "BSD"
96121 + * c-basic-offset: 4
96122 + * tab-width: 4
96123 + * indent-tabs-mode: nil
96124 + * End:
96125 + */
96126 diff -ruNp linux-2.6.19/include/xen/interface/io/blkif.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/blkif.h
96127 --- linux-2.6.19/include/xen/interface/io/blkif.h       1970-01-01 00:00:00.000000000 +0000
96128 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/blkif.h     2007-02-02 19:11:00.000000000 +0000
96129 @@ -0,0 +1,126 @@
96130 +/******************************************************************************
96131 + * blkif.h
96132 + * 
96133 + * Unified block-device I/O interface for Xen guest OSes.
96134 + * 
96135 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96136 + * of this software and associated documentation files (the "Software"), to
96137 + * deal in the Software without restriction, including without limitation the
96138 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96139 + * sell copies of the Software, and to permit persons to whom the Software is
96140 + * furnished to do so, subject to the following conditions:
96141 + *
96142 + * The above copyright notice and this permission notice shall be included in
96143 + * all copies or substantial portions of the Software.
96144 + *
96145 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96146 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96147 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96148 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96149 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96150 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96151 + * DEALINGS IN THE SOFTWARE.
96152 + *
96153 + * Copyright (c) 2003-2004, Keir Fraser
96154 + */
96155 +
96156 +#ifndef __XEN_PUBLIC_IO_BLKIF_H__
96157 +#define __XEN_PUBLIC_IO_BLKIF_H__
96158 +
96159 +#include "ring.h"
96160 +#include "../grant_table.h"
96161 +
96162 +/*
96163 + * Front->back notifications: When enqueuing a new request, sending a
96164 + * notification can be made conditional on req_event (i.e., the generic
96165 + * hold-off mechanism provided by the ring macros). Backends must set
96166 + * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
96167 + * 
96168 + * Back->front notifications: When enqueuing a new response, sending a
96169 + * notification can be made conditional on rsp_event (i.e., the generic
96170 + * hold-off mechanism provided by the ring macros). Frontends must set
96171 + * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
96172 + */
96173 +
96174 +#ifndef blkif_vdev_t
96175 +#define blkif_vdev_t   uint16_t
96176 +#endif
96177 +#define blkif_sector_t uint64_t
96178 +
96179 +/*
96180 + * REQUEST CODES.
96181 + */
96182 +#define BLKIF_OP_READ              0
96183 +#define BLKIF_OP_WRITE             1
96184 +/*
96185 + * Recognised only if "feature-barrier" is present in backend xenbus info.
96186 + * The "feature-barrier" node contains a boolean indicating whether barrier
96187 + * requests are likely to succeed or fail. Either way, a barrier request
96188 + * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
96189 + * the underlying block-device hardware. The boolean simply indicates whether
96190 + * or not it is worthwhile for the frontend to attempt barrier requests.
96191 + * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
96192 + * create the "feature-barrier" node!
96193 + */
96194 +#define BLKIF_OP_WRITE_BARRIER     2
96195 +
96196 +/*
96197 + * Maximum scatter/gather segments per request.
96198 + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
96199 + * NB. This could be 12 if the ring indexes weren't stored in the same page.
96200 + */
96201 +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
96202 +
96203 +struct blkif_request {
96204 +    uint8_t        operation;    /* BLKIF_OP_???                         */
96205 +    uint8_t        nr_segments;  /* number of segments                   */
96206 +    blkif_vdev_t   handle;       /* only for read/write requests         */
96207 +    uint64_t       id;           /* private guest value, echoed in resp  */
96208 +    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
96209 +    struct blkif_request_segment {
96210 +        grant_ref_t gref;        /* reference to I/O buffer frame        */
96211 +        /* @first_sect: first sector in frame to transfer (inclusive).   */
96212 +        /* @last_sect: last sector in frame to transfer (inclusive).     */
96213 +        uint8_t     first_sect, last_sect;
96214 +    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
96215 +};
96216 +typedef struct blkif_request blkif_request_t;
96217 +
96218 +struct blkif_response {
96219 +    uint64_t        id;              /* copied from request */
96220 +    uint8_t         operation;       /* copied from request */
96221 +    int16_t         status;          /* BLKIF_RSP_???       */
96222 +};
96223 +typedef struct blkif_response blkif_response_t;
96224 +
96225 +/*
96226 + * STATUS RETURN CODES.
96227 + */
96228 + /* Operation not supported (only happens on barrier writes). */
96229 +#define BLKIF_RSP_EOPNOTSUPP  -2
96230 + /* Operation failed for some unspecified reason (-EIO). */
96231 +#define BLKIF_RSP_ERROR       -1
96232 + /* Operation completed successfully. */
96233 +#define BLKIF_RSP_OKAY         0
96234 +
96235 +/*
96236 + * Generate blkif ring structures and types.
96237 + */
96238 +
96239 +DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
96240 +
96241 +#define VDISK_CDROM        0x1
96242 +#define VDISK_REMOVABLE    0x2
96243 +#define VDISK_READONLY     0x4
96244 +
96245 +#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
96246 +
96247 +/*
96248 + * Local variables:
96249 + * mode: C
96250 + * c-set-style: "BSD"
96251 + * c-basic-offset: 4
96252 + * tab-width: 4
96253 + * indent-tabs-mode: nil
96254 + * End:
96255 + */
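For illustration only (the grant reference, sector and id are placeholders), a
frontend might fill a one-segment read request like this before pushing it onto the
shared ring with the ring.h macros:

    static void blkif_fill_read(blkif_request_t *req, grant_ref_t gref,
                                blkif_sector_t sector, uint64_t id)
    {
        req->operation     = BLKIF_OP_READ;
        req->nr_segments   = 1;
        req->handle        = 0;          /* virtual device, placeholder */
        req->id            = id;         /* echoed back in the response */
        req->sector_number = sector;

        req->seg[0].gref       = gref;   /* frame holding the I/O buffer */
        req->seg[0].first_sect = 0;      /* whole 4K frame: sectors 0..7 */
        req->seg[0].last_sect  = 7;
    }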
96256 diff -ruNp linux-2.6.19/include/xen/interface/io/console.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/console.h
96257 --- linux-2.6.19/include/xen/interface/io/console.h     1970-01-01 00:00:00.000000000 +0000
96258 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/console.h   2007-02-02 19:11:00.000000000 +0000
96259 @@ -0,0 +1,51 @@
96260 +/******************************************************************************
96261 + * console.h
96262 + * 
96263 + * Console I/O interface for Xen guest OSes.
96264 + * 
96265 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96266 + * of this software and associated documentation files (the "Software"), to
96267 + * deal in the Software without restriction, including without limitation the
96268 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96269 + * sell copies of the Software, and to permit persons to whom the Software is
96270 + * furnished to do so, subject to the following conditions:
96271 + *
96272 + * The above copyright notice and this permission notice shall be included in
96273 + * all copies or substantial portions of the Software.
96274 + *
96275 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96276 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96277 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96278 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96279 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96280 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96281 + * DEALINGS IN THE SOFTWARE.
96282 + *
96283 + * Copyright (c) 2005, Keir Fraser
96284 + */
96285 +
96286 +#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
96287 +#define __XEN_PUBLIC_IO_CONSOLE_H__
96288 +
96289 +typedef uint32_t XENCONS_RING_IDX;
96290 +
96291 +#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
96292 +
96293 +struct xencons_interface {
96294 +    char in[1024];
96295 +    char out[2048];
96296 +    XENCONS_RING_IDX in_cons, in_prod;
96297 +    XENCONS_RING_IDX out_cons, out_prod;
96298 +};
96299 +
96300 +#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
96301 +
96302 +/*
96303 + * Local variables:
96304 + * mode: C
96305 + * c-set-style: "BSD"
96306 + * c-basic-offset: 4
96307 + * tab-width: 4
96308 + * indent-tabs-mode: nil
96309 + * End:
96310 + */
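A sketch of the producer side of the console 'out' ring; the write barrier before
publishing out_prod and the event-channel notification are left to the caller:

    static int xencons_put_char(struct xencons_interface *intf, char c)
    {
        XENCONS_RING_IDX prod = intf->out_prod;

        if (prod - intf->out_cons >= sizeof(intf->out))
            return 0;                            /* ring full */
        intf->out[MASK_XENCONS_IDX(prod, intf->out)] = c;
        intf->out_prod = prod + 1;               /* publish after the data */
        return 1;
    }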
96311 diff -ruNp linux-2.6.19/include/xen/interface/io/fbif.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/fbif.h
96312 --- linux-2.6.19/include/xen/interface/io/fbif.h        1970-01-01 00:00:00.000000000 +0000
96313 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/fbif.h      2007-02-02 19:11:00.000000000 +0000
96314 @@ -0,0 +1,138 @@
96315 +/*
96316 + * fbif.h -- Xen virtual frame buffer device
96317 + *
96318 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96319 + * of this software and associated documentation files (the "Software"), to
96320 + * deal in the Software without restriction, including without limitation the
96321 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96322 + * sell copies of the Software, and to permit persons to whom the Software is
96323 + * furnished to do so, subject to the following conditions:
96324 + *
96325 + * The above copyright notice and this permission notice shall be included in
96326 + * all copies or substantial portions of the Software.
96327 + *
96328 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96329 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96330 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96331 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96332 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96333 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96334 + * DEALINGS IN THE SOFTWARE.
96335 + *
96336 + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
96337 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
96338 + */
96339 +
96340 +#ifndef __XEN_PUBLIC_IO_FBIF_H__
96341 +#define __XEN_PUBLIC_IO_FBIF_H__
96342 +
96343 +/* Out events (frontend -> backend) */
96344 +
96345 +/*
96346 + * Out events may be sent only when requested by backend, and receipt
96347 + * of an unknown out event is an error.
96348 + */
96349 +
96350 +/* Event type 1 currently not used */
96351 +/*
96352 + * Framebuffer update notification event
96353 + * Capable frontend sets feature-update in xenstore.
96354 + * Backend requests it by setting request-update in xenstore.
96355 + */
96356 +#define XENFB_TYPE_UPDATE 2
96357 +
96358 +struct xenfb_update
96359 +{
96360 +    uint8_t type;    /* XENFB_TYPE_UPDATE */
96361 +    int32_t x;      /* source x */
96362 +    int32_t y;      /* source y */
96363 +    int32_t width;  /* rect width */
96364 +    int32_t height; /* rect height */
96365 +};
96366 +
96367 +#define XENFB_OUT_EVENT_SIZE 40
96368 +
96369 +union xenfb_out_event
96370 +{
96371 +    uint8_t type;
96372 +    struct xenfb_update update;
96373 +    char pad[XENFB_OUT_EVENT_SIZE];
96374 +};
96375 +
96376 +/* In events (backend -> frontend) */
96377 +
96378 +/*
96379 + * Frontends should ignore unknown in events.
96380 + * No in events currently defined.
96381 + */
96382 +
96383 +#define XENFB_IN_EVENT_SIZE 40
96384 +
96385 +union xenfb_in_event
96386 +{
96387 +    uint8_t type;
96388 +    char pad[XENFB_IN_EVENT_SIZE];
96389 +};
96390 +
96391 +/* shared page */
96392 +
96393 +#define XENFB_IN_RING_SIZE 1024
96394 +#define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE)
96395 +#define XENFB_IN_RING_OFFS 1024
96396 +#define XENFB_IN_RING(page) \
96397 +    ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
96398 +#define XENFB_IN_RING_REF(page, idx) \
96399 +    (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
96400 +
96401 +#define XENFB_OUT_RING_SIZE 2048
96402 +#define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE)
96403 +#define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE)
96404 +#define XENFB_OUT_RING(page) \
96405 +    ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
96406 +#define XENFB_OUT_RING_REF(page, idx) \
96407 +    (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
96408 +
96409 +struct xenfb_page
96410 +{
96411 +    uint32_t in_cons, in_prod;
96412 +    uint32_t out_cons, out_prod;
96413 +
96414 +    int32_t width;          /* the width of the framebuffer (in pixels) */
96415 +    int32_t height;         /* the height of the framebuffer (in pixels) */
96416 +    uint32_t line_length;   /* the length of a row of pixels (in bytes) */
96417 +    uint32_t mem_length;    /* the length of the framebuffer (in bytes) */
96418 +    uint8_t depth;          /* the depth of a pixel (in bits) */
96419 +
96420 +    /*
96421 +     * Framebuffer page directory
96422 +     *
96423 +     * Each directory page holds PAGE_SIZE / sizeof(*pd)
96424 +     * framebuffer pages, and can thus map up to PAGE_SIZE *
96425 +     * PAGE_SIZE / sizeof(*pd) bytes.  With PAGE_SIZE == 4096 and
96426 +     * sizeof(unsigned long) == 4, that's 4 Megs.  Two directory
96427 +     * pages should be enough for a while.
96428 +     */
96429 +    unsigned long pd[2];
96430 +};
96431 +
96432 +/*
96433 + * Wart: xenkbd needs to know resolution.  Put it here until a better
96434 + * solution is found, but don't leak it to the backend.
96435 + */
96436 +#ifdef __KERNEL__
96437 +#define XENFB_WIDTH 800
96438 +#define XENFB_HEIGHT 600
96439 +#define XENFB_DEPTH 32
96440 +#endif
96441 +
96442 +#endif
96443 +
96444 +/*
96445 + * Local variables:
96446 + * mode: C
96447 + * c-set-style: "BSD"
96448 + * c-basic-offset: 4
96449 + * tab-width: 4
96450 + * indent-tabs-mode: nil
96451 + * End:
96452 + */
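A sketch of a frontend queuing an XENFB_TYPE_UPDATE event on the shared page;
overflow checking against XENFB_OUT_RING_LEN and the event-channel kick are assumed
to be handled elsewhere:

    static void xenfb_send_update(struct xenfb_page *page,
                                  int x, int y, int w, int h)
    {
        union xenfb_out_event *ev = &XENFB_OUT_RING_REF(page, page->out_prod);

        ev->update.type   = XENFB_TYPE_UPDATE;
        ev->update.x      = x;
        ev->update.y      = y;
        ev->update.width  = w;
        ev->update.height = h;
        page->out_prod++;                /* publish after filling the slot */
    }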
96453 diff -ruNp linux-2.6.19/include/xen/interface/io/kbdif.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/kbdif.h
96454 --- linux-2.6.19/include/xen/interface/io/kbdif.h       1970-01-01 00:00:00.000000000 +0000
96455 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/kbdif.h     2007-02-02 19:11:00.000000000 +0000
96456 @@ -0,0 +1,130 @@
96457 +/*
96458 + * kbdif.h -- Xen virtual keyboard/mouse
96459 + *
96460 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96461 + * of this software and associated documentation files (the "Software"), to
96462 + * deal in the Software without restriction, including without limitation the
96463 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96464 + * sell copies of the Software, and to permit persons to whom the Software is
96465 + * furnished to do so, subject to the following conditions:
96466 + *
96467 + * The above copyright notice and this permission notice shall be included in
96468 + * all copies or substantial portions of the Software.
96469 + *
96470 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96471 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96472 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96473 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96474 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96475 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96476 + * DEALINGS IN THE SOFTWARE.
96477 + *
96478 + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
96479 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
96480 + */
96481 +
96482 +#ifndef __XEN_PUBLIC_IO_KBDIF_H__
96483 +#define __XEN_PUBLIC_IO_KBDIF_H__
96484 +
96485 +/* In events (backend -> frontend) */
96486 +
96487 +/*
96488 + * Frontends should ignore unknown in events.
96489 + */
96490 +
96491 +/* Pointer movement event */
96492 +#define XENKBD_TYPE_MOTION  1
96493 +/* Event type 2 currently not used */
96494 +/* Key event (includes pointer buttons) */
96495 +#define XENKBD_TYPE_KEY     3
96496 +/*
96497 + * Pointer position event
96498 + * Capable backend sets feature-abs-pointer in xenstore.
96499 + * Frontend requests it instead of XENKBD_TYPE_MOTION by setting
96500 + * request-abs-update in xenstore.
96501 + */
96502 +#define XENKBD_TYPE_POS     4
96503 +
96504 +struct xenkbd_motion
96505 +{
96506 +    uint8_t type;        /* XENKBD_TYPE_MOTION */
96507 +    int32_t rel_x;       /* relative X motion */
96508 +    int32_t rel_y;       /* relative Y motion */
96509 +};
96510 +
96511 +struct xenkbd_key
96512 +{
96513 +    uint8_t type;         /* XENKBD_TYPE_KEY */
96514 +    uint8_t pressed;      /* 1 if pressed; 0 otherwise */
96515 +    uint32_t keycode;     /* KEY_* from linux/input.h */
96516 +};
96517 +
96518 +struct xenkbd_position
96519 +{
96520 +    uint8_t type;        /* XENKBD_TYPE_POS */
96521 +    int32_t abs_x;       /* absolute X position (in FB pixels) */
96522 +    int32_t abs_y;       /* absolute Y position (in FB pixels) */
96523 +};
96524 +
96525 +#define XENKBD_IN_EVENT_SIZE 40
96526 +
96527 +union xenkbd_in_event
96528 +{
96529 +    uint8_t type;
96530 +    struct xenkbd_motion motion;
96531 +    struct xenkbd_key key;
96532 +    struct xenkbd_position pos;
96533 +    char pad[XENKBD_IN_EVENT_SIZE];
96534 +};
96535 +
96536 +/* Out events (frontend -> backend) */
96537 +
96538 +/*
96539 + * Out events may be sent only when requested by backend, and receipt
96540 + * of an unknown out event is an error.
96541 + * No out events currently defined.
96542 + */
96543 +
96544 +#define XENKBD_OUT_EVENT_SIZE 40
96545 +
96546 +union xenkbd_out_event
96547 +{
96548 +    uint8_t type;
96549 +    char pad[XENKBD_OUT_EVENT_SIZE];
96550 +};
96551 +
96552 +/* shared page */
96553 +
96554 +#define XENKBD_IN_RING_SIZE 2048
96555 +#define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE)
96556 +#define XENKBD_IN_RING_OFFS 1024
96557 +#define XENKBD_IN_RING(page) \
96558 +    ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
96559 +#define XENKBD_IN_RING_REF(page, idx) \
96560 +    (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
96561 +
96562 +#define XENKBD_OUT_RING_SIZE 1024
96563 +#define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE)
96564 +#define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE)
96565 +#define XENKBD_OUT_RING(page) \
96566 +    ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
96567 +#define XENKBD_OUT_RING_REF(page, idx) \
96568 +    (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
96569 +
96570 +struct xenkbd_page
96571 +{
96572 +    uint32_t in_cons, in_prod;
96573 +    uint32_t out_cons, out_prod;
96574 +};
96575 +
96576 +#endif
96577 +
96578 +/*
96579 + * Local variables:
96580 + * mode: C
96581 + * c-set-style: "BSD"
96582 + * c-basic-offset: 4
96583 + * tab-width: 4
96584 + * indent-tabs-mode: nil
96585 + * End:
96586 + */
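A sketch of the frontend's consumer loop for the 'in' ring; report_key() and
report_motion() are placeholders for input-layer reporting, and barriers are
omitted:

    static void xenkbd_handle_in(struct xenkbd_page *page)
    {
        uint32_t cons = page->in_cons;

        while (cons != page->in_prod) {
            union xenkbd_in_event *ev = &XENKBD_IN_RING_REF(page, cons);

            switch (ev->type) {
            case XENKBD_TYPE_KEY:
                report_key(ev->key.keycode, ev->key.pressed);       /* placeholder */
                break;
            case XENKBD_TYPE_MOTION:
                report_motion(ev->motion.rel_x, ev->motion.rel_y);  /* placeholder */
                break;
            default:
                break;   /* unknown in events are ignored, as noted above */
            }
            cons++;
        }
        page->in_cons = cons;
    }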
96587 diff -ruNp linux-2.6.19/include/xen/interface/io/netif.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/netif.h
96588 --- linux-2.6.19/include/xen/interface/io/netif.h       1970-01-01 00:00:00.000000000 +0000
96589 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/netif.h     2007-02-02 19:11:00.000000000 +0000
96590 @@ -0,0 +1,184 @@
96591 +/******************************************************************************
96592 + * netif.h
96593 + * 
96594 + * Unified network-device I/O interface for Xen guest OSes.
96595 + * 
96596 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96597 + * of this software and associated documentation files (the "Software"), to
96598 + * deal in the Software without restriction, including without limitation the
96599 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96600 + * sell copies of the Software, and to permit persons to whom the Software is
96601 + * furnished to do so, subject to the following conditions:
96602 + *
96603 + * The above copyright notice and this permission notice shall be included in
96604 + * all copies or substantial portions of the Software.
96605 + *
96606 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96607 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96608 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96609 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96610 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96611 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96612 + * DEALINGS IN THE SOFTWARE.
96613 + *
96614 + * Copyright (c) 2003-2004, Keir Fraser
96615 + */
96616 +
96617 +#ifndef __XEN_PUBLIC_IO_NETIF_H__
96618 +#define __XEN_PUBLIC_IO_NETIF_H__
96619 +
96620 +#include "ring.h"
96621 +#include "../grant_table.h"
96622 +
96623 +/*
96624 + * Notifications after enqueuing any type of message should be conditional on
96625 + * the appropriate req_event or rsp_event field in the shared ring.
96626 + * If the client sends notification for rx requests then it should specify
96627 + * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
96628 + * that it cannot safely queue packets (as it may not be kicked to send them).
96629 + */
96630 +
96631 +/*
96632 + * This is the 'wire' format for packets:
96633 + *  Request 1: netif_tx_request -- NETTXF_* (any flags)
96634 + * [Request 2: netif_tx_extra]  (only if request 1 has NETTXF_extra_info)
96635 + * [Request 3: netif_tx_extra]  (only if request 2 has XEN_NETIF_EXTRA_MORE)
96636 + *  Request 4: netif_tx_request -- NETTXF_more_data
96637 + *  Request 5: netif_tx_request -- NETTXF_more_data
96638 + *  ...
96639 + *  Request N: netif_tx_request -- 0
96640 + */
96641 +
96642 +/* Protocol checksum field is blank in the packet (hardware offload)? */
96643 +#define _NETTXF_csum_blank     (0)
96644 +#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
96645 +
96646 +/* Packet data has been validated against protocol checksum. */
96647 +#define _NETTXF_data_validated (1)
96648 +#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
96649 +
96650 +/* Packet continues in the next request descriptor. */
96651 +#define _NETTXF_more_data      (2)
96652 +#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
96653 +
96654 +/* Packet to be followed by extra descriptor(s). */
96655 +#define _NETTXF_extra_info     (3)
96656 +#define  NETTXF_extra_info     (1U<<_NETTXF_extra_info)
96657 +
96658 +struct netif_tx_request {
96659 +    grant_ref_t gref;      /* Reference to buffer page */
96660 +    uint16_t offset;       /* Offset within buffer page */
96661 +    uint16_t flags;        /* NETTXF_* */
96662 +    uint16_t id;           /* Echoed in response message. */
96663 +    uint16_t size;         /* Packet size in bytes.       */
96664 +};
96665 +typedef struct netif_tx_request netif_tx_request_t;
96666 +
96667 +/* Types of netif_extra_info descriptors. */
96668 +#define XEN_NETIF_EXTRA_TYPE_NONE  (0)  /* Never used - invalid */
96669 +#define XEN_NETIF_EXTRA_TYPE_GSO   (1)  /* u.gso */
96670 +#define XEN_NETIF_EXTRA_TYPE_MAX   (2)
96671 +
96672 +/* netif_extra_info flags. */
96673 +#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
96674 +#define XEN_NETIF_EXTRA_FLAG_MORE  (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
96675 +
96676 +/* GSO types - only TCPv4 currently supported. */
96677 +#define XEN_NETIF_GSO_TYPE_TCPV4        (1)
96678 +
96679 +/*
96680 + * This structure needs to fit within both netif_tx_request and
96681 + * netif_rx_response for compatibility.
96682 + */
96683 +struct netif_extra_info {
96684 +    uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
96685 +    uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
96686 +
96687 +    union {
96688 +        struct {
96689 +            /*
96690 +             * Maximum payload size of each segment. For example, for TCP this
96691 +             * is just the path MSS.
96692 +             */
96693 +            uint16_t size;
96694 +
96695 +            /*
96696 +             * GSO type. This determines the protocol of the packet and any
96697 +             * extra features required to segment the packet properly.
96698 +             */
96699 +            uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
96700 +
96701 +            /* Future expansion. */
96702 +            uint8_t pad;
96703 +
96704 +            /*
96705 +             * GSO features. This specifies any extra GSO features required
96706 +             * to process this packet, such as ECN support for TCPv4.
96707 +             */
96708 +            uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
96709 +        } gso;
96710 +
96711 +        uint16_t pad[3];
96712 +    } u;
96713 +};
96714 +
96715 +struct netif_tx_response {
96716 +    uint16_t id;
96717 +    int16_t  status;       /* NETIF_RSP_* */
96718 +};
96719 +typedef struct netif_tx_response netif_tx_response_t;
96720 +
96721 +struct netif_rx_request {
96722 +    uint16_t    id;        /* Echoed in response message.        */
96723 +    grant_ref_t gref;      /* Reference to incoming granted frame */
96724 +};
96725 +typedef struct netif_rx_request netif_rx_request_t;
96726 +
96727 +/* Packet data has been validated against protocol checksum. */
96728 +#define _NETRXF_data_validated (0)
96729 +#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
96730 +
96731 +/* Protocol checksum field is blank in the packet (hardware offload)? */
96732 +#define _NETRXF_csum_blank     (1)
96733 +#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
96734 +
96735 +/* Packet continues in the next request descriptor. */
96736 +#define _NETRXF_more_data      (2)
96737 +#define  NETRXF_more_data      (1U<<_NETRXF_more_data)
96738 +
96739 +/* Packet to be followed by extra descriptor(s). */
96740 +#define _NETRXF_extra_info     (3)
96741 +#define  NETRXF_extra_info     (1U<<_NETRXF_extra_info)
96742 +
96743 +struct netif_rx_response {
96744 +    uint16_t id;
96745 +    uint16_t offset;       /* Offset in page of start of received packet  */
96746 +    uint16_t flags;        /* NETRXF_* */
96747 +    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
96748 +};
96749 +typedef struct netif_rx_response netif_rx_response_t;
96750 +
96751 +/*
96752 + * Generate netif ring structures and types.
96753 + */
96754 +
96755 +DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
96756 +DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
96757 +
96758 +#define NETIF_RSP_DROPPED         -2
96759 +#define NETIF_RSP_ERROR           -1
96760 +#define NETIF_RSP_OKAY             0
96761 +/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
96762 +#define NETIF_RSP_NULL             1
96763 +
96764 +#endif
96765 +
96766 +/*
96767 + * Local variables:
96768 + * mode: C
96769 + * c-set-style: "BSD"
96770 + * c-basic-offset: 4
96771 + * tab-width: 4
96772 + * indent-tabs-mode: nil
96773 + * End:
96774 + */
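To make the wire format above concrete, here is a hedged sketch of a frontend
queuing a packet split across two descriptors; put_tx_request() stands in for the
ring.h producer machinery, and the convention that the first descriptor carries the
total packet length is stated as an assumption rather than something this header
spells out:

    static void queue_two_part_packet(grant_ref_t g1, uint16_t len1,
                                      grant_ref_t g2, uint16_t len2)
    {
        netif_tx_request_t first = {
            .gref   = g1,
            .offset = 0,
            .flags  = NETTXF_csum_blank | NETTXF_more_data,
            .id     = 1,                  /* echoed in its response */
            .size   = len1 + len2,        /* assumed: total packet length */
        };
        netif_tx_request_t rest = {
            .gref   = g2,
            .offset = 0,
            .flags  = 0,                  /* last fragment: no more_data */
            .id     = 2,
            .size   = len2,               /* this fragment only */
        };

        put_tx_request(&first);           /* hypothetical ring helper */
        put_tx_request(&rest);
    }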
96775 diff -ruNp linux-2.6.19/include/xen/interface/io/pciif.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/pciif.h
96776 --- linux-2.6.19/include/xen/interface/io/pciif.h       1970-01-01 00:00:00.000000000 +0000
96777 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/pciif.h     2007-02-02 19:11:00.000000000 +0000
96778 @@ -0,0 +1,83 @@
96779 +/*
96780 + * PCI Backend/Frontend Common Data Structures & Macros
96781 + *
96782 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96783 + * of this software and associated documentation files (the "Software"), to
96784 + * deal in the Software without restriction, including without limitation the
96785 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96786 + * sell copies of the Software, and to permit persons to whom the Software is
96787 + * furnished to do so, subject to the following conditions:
96788 + *
96789 + * The above copyright notice and this permission notice shall be included in
96790 + * all copies or substantial portions of the Software.
96791 + *
96792 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96793 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96794 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96795 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96796 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96797 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96798 + * DEALINGS IN THE SOFTWARE.
96799 + *
96800 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
96801 + */
96802 +#ifndef __XEN_PCI_COMMON_H__
96803 +#define __XEN_PCI_COMMON_H__
96804 +
96805 +/* Be sure to bump this number if you change this file */
96806 +#define XEN_PCI_MAGIC "7"
96807 +
96808 +/* xen_pci_sharedinfo flags */
96809 +#define _XEN_PCIF_active     (0)
96810 +#define XEN_PCIF_active      (1<<_XEN_PCIF_active)
96811 +
96812 +/* xen_pci_op commands */
96813 +#define XEN_PCI_OP_conf_read    (0)
96814 +#define XEN_PCI_OP_conf_write   (1)
96815 +
96816 +/* xen_pci_op error numbers */
96817 +#define XEN_PCI_ERR_success          (0)
96818 +#define XEN_PCI_ERR_dev_not_found   (-1)
96819 +#define XEN_PCI_ERR_invalid_offset  (-2)
96820 +#define XEN_PCI_ERR_access_denied   (-3)
96821 +#define XEN_PCI_ERR_not_implemented (-4)
96822 +/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
96823 +#define XEN_PCI_ERR_op_failed       (-5)
96824 +
96825 +struct xen_pci_op {
96826 +    /* IN: what action to perform: XEN_PCI_OP_* */
96827 +    uint32_t cmd;
96828 +
96829 +    /* OUT: will contain an error number (if any) from errno.h */
96830 +    int32_t err;
96831 +
96832 +    /* IN: which device to touch */
96833 +    uint32_t domain; /* PCI Domain/Segment */
96834 +    uint32_t bus;
96835 +    uint32_t devfn;
96836 +
96837 +    /* IN: which configuration registers to touch */
96838 +    int32_t offset;
96839 +    int32_t size;
96840 +
96841 +    /* IN/OUT: Contains the result after a READ or the value to WRITE */
96842 +    uint32_t value;
96843 +};
96844 +
96845 +struct xen_pci_sharedinfo {
96846 +    /* flags - XEN_PCIF_* */
96847 +    uint32_t flags;
96848 +    struct xen_pci_op op;
96849 +};
96850 +
96851 +#endif /* __XEN_PCI_COMMON_H__ */
96852 +
96853 +/*
96854 + * Local variables:
96855 + * mode: C
96856 + * c-set-style: "BSD"
96857 + * c-basic-offset: 4
96858 + * tab-width: 4
96859 + * indent-tabs-mode: nil
96860 + * End:
96861 + */
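A pcifront-style sketch of a configuration-space read through the shared page;
wait_for_backend() is a placeholder for the event-channel handshake, and real code
would also need barriers and locking:

    static int pcifront_conf_read(struct xen_pci_sharedinfo *info,
                                  uint32_t domain, uint32_t bus, uint32_t devfn,
                                  int32_t offset, int32_t size, uint32_t *val)
    {
        struct xen_pci_op *op = &info->op;

        op->cmd    = XEN_PCI_OP_conf_read;
        op->domain = domain;
        op->bus    = bus;
        op->devfn  = devfn;
        op->offset = offset;
        op->size   = size;               /* 1, 2 or 4 bytes */

        info->flags |= XEN_PCIF_active;  /* hand the op to the backend */
        wait_for_backend(info);          /* hypothetical: wait until cleared */

        if (op->err == XEN_PCI_ERR_success)
            *val = op->value;            /* READ result lands in 'value' */
        return op->err;
    }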
96862 diff -ruNp linux-2.6.19/include/xen/interface/io/ring.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/ring.h
96863 --- linux-2.6.19/include/xen/interface/io/ring.h        1970-01-01 00:00:00.000000000 +0000
96864 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/ring.h      2007-02-02 19:11:00.000000000 +0000
96865 @@ -0,0 +1,299 @@
96866 +/******************************************************************************
96867 + * ring.h
96868 + * 
96869 + * Shared producer-consumer ring macros.
96870 + *
96871 + * Permission is hereby granted, free of charge, to any person obtaining a copy
96872 + * of this software and associated documentation files (the "Software"), to
96873 + * deal in the Software without restriction, including without limitation the
96874 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
96875 + * sell copies of the Software, and to permit persons to whom the Software is
96876 + * furnished to do so, subject to the following conditions:
96877 + *
96878 + * The above copyright notice and this permission notice shall be included in
96879 + * all copies or substantial portions of the Software.
96880 + *
96881 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
96882 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
96883 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
96884 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
96885 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
96886 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
96887 + * DEALINGS IN THE SOFTWARE.
96888 + *
96889 + * Tim Deegan and Andrew Warfield November 2004.
96890 + */
96891 +
96892 +#ifndef __XEN_PUBLIC_IO_RING_H__
96893 +#define __XEN_PUBLIC_IO_RING_H__
96894 +
96895 +typedef unsigned int RING_IDX;
96896 +
96897 +/* Round a 32-bit unsigned constant down to the nearest power of two. */
96898 +#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
96899 +#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
96900 +#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
96901 +#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
96902 +#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
96903 +
96904 +/*
96905 + * Calculate size of a shared ring, given the total available space for the
96906 + * ring and indexes (_sz), and the name tag of the request/response structure.
96907 + * A ring contains as many entries as will fit, rounded down to the nearest 
96908 + * power of two (so we can mask with (size-1) to loop around).
96909 + */
96910 +#define __RING_SIZE(_s, _sz) \
96911 +    (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
96912 +
96913 +/*
96914 + * Macros to make the correct C datatypes for a new kind of ring.
96915 + * 
96916 + * To make a new ring datatype, you need to have two message structures,
96917 + * let's say request_t, and response_t already defined.
96918 + *
96919 + * In a header where you want the ring datatype declared, you then do:
96920 + *
96921 + *     DEFINE_RING_TYPES(mytag, request_t, response_t);
96922 + *
96923 + * These expand out to give you a set of types, as you can see below.
96924 + * The most important of these are:
96925 + * 
96926 + *     mytag_sring_t      - The shared ring.
96927 + *     mytag_front_ring_t - The 'front' half of the ring.
96928 + *     mytag_back_ring_t  - The 'back' half of the ring.
96929 + *
96930 + * To initialize a ring in your code you need to know the location and size
96931 + * of the shared memory area (PAGE_SIZE, for instance). To initialise
96932 + * the front half:
96933 + *
96934 + *     mytag_front_ring_t front_ring;
96935 + *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
96936 + *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
96937 + *
96938 + * Initializing the back follows similarly (note that only the front
96939 + * initializes the shared ring):
96940 + *
96941 + *     mytag_back_ring_t back_ring;
96942 + *     BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
96943 + */
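A minimal sketch of the front-end producer path implied by the comment above, using
only the fields that DEFINE_RING_TYPES generates; the 'mytag' request type and the
final push/notify step are hypothetical:

    /* Assumes DEFINE_RING_TYPES(mytag, struct mytag_request,
     * struct mytag_response) has been instantiated and FRONT_RING_INIT() run. */
    static void mytag_queue_request(mytag_front_ring_t *r,
                                    const struct mytag_request *req)
    {
        RING_IDX i = r->req_prod_pvt;            /* next private slot */

        /* nr_ents is a power of two, so mask rather than modulo. */
        r->sring->ring[i & (r->nr_ents - 1)].req = *req;
        r->req_prod_pvt = i + 1;
        /* Publishing req_prod_pvt to sring->req_prod and deciding whether to
         * notify the backend is handled by later macros in this file. */
    }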
96944 +
96945 +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
96946 +                                                                        \
96947 +/* Shared ring entry */                                                 \
96948 +union __name##_sring_entry {                                            \
96949 +    __req_t req;                                                        \
96950 +    __rsp_t rsp;                                                        \
96951 +};                                                                      \
96952 +                                                                        \
96953 +/* Shared ring page */                                                  \
96954 +struct __name##_sring {                                                 \
96955 +    RING_IDX req_prod, req_event;                                       \
96956 +    RING_IDX rsp_prod, rsp_event;                                       \
96957 +    uint8_t  pad[48];                                                   \
96958 +    union __name##_sring_entry ring[1]; /* variable-length */           \
96959 +};                                                                      \
96960 +                                                                        \
96961 +/* "Front" end's private variables */                                   \
96962 +struct __name##_front_ring {                                            \
96963 +    RING_IDX req_prod_pvt;                                              \
96964 +    RING_IDX rsp_cons;                                                  \
96965 +    unsigned int nr_ents;                                               \
96966 +    struct __name##_sring *sring;                                       \
96967 +};                                                                      \
96968 +                                                                        \
96969 +/* "Back" end's private variables */                                    \
96970 +struct __name##_back_ring {                                             \
96971 +    RING_IDX rsp_prod_pvt;                                              \
96972 +    RING_IDX req_cons;                                                  \
96973 +    unsigned int nr_ents;                                               \
96974 +    struct __name##_sring *sring;                                       \
96975 +};                                                                      \
96976 +                                                                        \
96977 +/* Syntactic sugar */                                                   \
96978 +typedef struct __name##_sring __name##_sring_t;                         \
96979 +typedef struct __name##_front_ring __name##_front_ring_t;               \
96980 +typedef struct __name##_back_ring __name##_back_ring_t
96981 +
96982 +/*
96983 + * Macros for manipulating rings.
96984 + * 
96985 + * FRONT_RING_whatever works on the "front end" of a ring: here 
96986 + * requests are pushed on to the ring and responses taken off it.
96987 + * 
96988 + * BACK_RING_whatever works on the "back end" of a ring: here 
96989 + * requests are taken off the ring and responses put on.
96990 + * 
96991 + * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. 
96992 + * This is OK in 1-for-1 request-response situations where the 
96993 + * requestor (front end) never has more than RING_SIZE()-1
96994 + * outstanding requests.
96995 + */
96996 +
96997 +/* Initialising empty rings */
96998 +#define SHARED_RING_INIT(_s) do {                                       \
96999 +    (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
97000 +    (_s)->req_event = (_s)->rsp_event = 1;                              \
97001 +    memset((_s)->pad, 0, sizeof((_s)->pad));                            \
97002 +} while(0)
97003 +
97004 +#define FRONT_RING_INIT(_r, _s, __size) do {                            \
97005 +    (_r)->req_prod_pvt = 0;                                             \
97006 +    (_r)->rsp_cons = 0;                                                 \
97007 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
97008 +    (_r)->sring = (_s);                                                 \
97009 +} while (0)
97010 +
97011 +#define BACK_RING_INIT(_r, _s, __size) do {                             \
97012 +    (_r)->rsp_prod_pvt = 0;                                             \
97013 +    (_r)->req_cons = 0;                                                 \
97014 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
97015 +    (_r)->sring = (_s);                                                 \
97016 +} while (0)
97017 +
97018 +/* Initialize to existing shared indexes -- for recovery */
97019 +#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
97020 +    (_r)->sring = (_s);                                                 \
97021 +    (_r)->req_prod_pvt = (_s)->req_prod;                                \
97022 +    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
97023 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
97024 +} while (0)
97025 +
97026 +#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
97027 +    (_r)->sring = (_s);                                                 \
97028 +    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
97029 +    (_r)->req_cons = (_s)->req_prod;                                    \
97030 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
97031 +} while (0)
97032 +
97033 +/* How big is this ring? */
97034 +#define RING_SIZE(_r)                                                   \
97035 +    ((_r)->nr_ents)
97036 +
97037 +/* Number of free requests (for use on front side only). */
97038 +#define RING_FREE_REQUESTS(_r)                                          \
97039 +    (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
97040 +
97041 +/* Test if there is an empty slot available on the front ring.
97042 + * (This is only meaningful from the front. )
97043 + */
97044 +#define RING_FULL(_r)                                                   \
97045 +    (RING_FREE_REQUESTS(_r) == 0)
97046 +
97047 +/* Test if there are outstanding messages to be processed on a ring. */
97048 +#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
97049 +    ((_r)->sring->rsp_prod - (_r)->rsp_cons)
97050 +
97051 +#ifdef __GNUC__
97052 +#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({                             \
97053 +    unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;          \
97054 +    unsigned int rsp = RING_SIZE(_r) -                                  \
97055 +        ((_r)->req_cons - (_r)->rsp_prod_pvt);                          \
97056 +    req < rsp ? req : rsp;                                              \
97057 +})
97058 +#else
97059 +/* Same as above, but without the nice GCC ({ ... }) syntax. */
97060 +#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
97061 +    ((((_r)->sring->req_prod - (_r)->req_cons) <                        \
97062 +      (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ?        \
97063 +     ((_r)->sring->req_prod - (_r)->req_cons) :                         \
97064 +     (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))
97065 +#endif
97066 +
97067 +/* Direct access to individual ring elements, by index. */
97068 +#define RING_GET_REQUEST(_r, _idx)                                      \
97069 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
97070 +
97071 +#define RING_GET_RESPONSE(_r, _idx)                                     \
97072 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
97073 +
97074 +/* Loop termination condition: Would the specified index overflow the ring? */
97075 +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
97076 +    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
97077 +
97078 +#define RING_PUSH_REQUESTS(_r) do {                                     \
97079 +    wmb(); /* back sees requests /before/ updated producer index */     \
97080 +    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
97081 +} while (0)
97082 +
97083 +#define RING_PUSH_RESPONSES(_r) do {                                    \
97084 +    wmb(); /* front sees responses /before/ updated producer index */   \
97085 +    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
97086 +} while (0)
97087 +
97088 +/*
97089 + * Notification hold-off (req_event and rsp_event):
97090 + * 
97091 + * When queueing requests or responses on a shared ring, it may not always be
97092 + * necessary to notify the remote end. For example, if requests are in flight
97093 + * in a backend, the front may be able to queue further requests without
97094 + * notifying the back (if the back checks for new requests when it queues
97095 + * responses).
97096 + * 
97097 + * When enqueuing requests or responses:
97098 + * 
97099 + *  Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
97100 + *  is a boolean return value. True indicates that the receiver requires an
97101 + *  asynchronous notification.
97102 + * 
97103 + * After dequeuing requests or responses (before sleeping the connection):
97104 + * 
97105 + *  Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
97106 + *  The second argument is a boolean return value. True indicates that there
97107 + *  are pending messages on the ring (i.e., the connection should not be put
97108 + *  to sleep).
97109 + * 
97110 + *  These macros will set the req_event/rsp_event field to trigger a
97111 + *  notification on the very next message that is enqueued. If you want to
97112 + *  create batches of work (i.e., only receive a notification after several
97113 + *  messages have been enqueued) then you will need to create a customised
97114 + *  version of the FINAL_CHECK macro in your own code, which sets the event
97115 + *  field appropriately.
97116 + */
97117 +
97118 +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {           \
97119 +    RING_IDX __old = (_r)->sring->req_prod;                             \
97120 +    RING_IDX __new = (_r)->req_prod_pvt;                                \
97121 +    wmb(); /* back sees requests /before/ updated producer index */     \
97122 +    (_r)->sring->req_prod = __new;                                      \
97123 +    mb(); /* back sees new requests /before/ we check req_event */      \
97124 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <           \
97125 +                 (RING_IDX)(__new - __old));                            \
97126 +} while (0)
97127 +
97128 +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {          \
97129 +    RING_IDX __old = (_r)->sring->rsp_prod;                             \
97130 +    RING_IDX __new = (_r)->rsp_prod_pvt;                                \
97131 +    wmb(); /* front sees responses /before/ updated producer index */   \
97132 +    (_r)->sring->rsp_prod = __new;                                      \
97133 +    mb(); /* front sees new responses /before/ we check rsp_event */    \
97134 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <           \
97135 +                 (RING_IDX)(__new - __old));                            \
97136 +} while (0)
97137 +
97138 +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {             \
97139 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
97140 +    if (_work_to_do) break;                                             \
97141 +    (_r)->sring->req_event = (_r)->req_cons + 1;                        \
97142 +    mb();                                                               \
97143 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
97144 +} while (0)
97145 +
97146 +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {            \
97147 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
97148 +    if (_work_to_do) break;                                             \
97149 +    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                        \
97150 +    mb();                                                               \
97151 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
97152 +} while (0)
97153 +
97154 +#endif /* __XEN_PUBLIC_IO_RING_H__ */
97155 +
97156 +/*
97157 + * Local variables:
97158 + * mode: C
97159 + * c-set-style: "BSD"
97160 + * c-basic-offset: 4
97161 + * tab-width: 4
97162 + * indent-tabs-mode: nil
97163 + * End:
97164 + */
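The macros above only manage producer/consumer indices; event signalling and shared-page setup are left to the caller. As a rough, hedged illustration (not part of the patch), the sketch below shows how a front end might drive a ring built with DEFINE_RING_TYPES() from earlier in this header. The demo_* types, notify_remote(), PAGE_SIZE and rmb() are assumptions standing in for whatever the real driver and kernel environment provide.

/* Hypothetical request/response types and ring instance; real drivers define
 * these in their own interface headers (blkif, netif, ...). */
struct demo_request  { uint64_t id; };
struct demo_response { uint64_t id; int16_t status; };
DEFINE_RING_TYPES(demo, struct demo_request, struct demo_response);

extern void notify_remote(void);      /* placeholder: event-channel kick */

static struct demo_front_ring front;

/* One-time setup on a zeroed page shared with the back end (mapping and
 * handshake details omitted). */
static void demo_ring_setup(void *shared_page)
{
    struct demo_sring *sring = shared_page;

    SHARED_RING_INIT(sring);
    FRONT_RING_INIT(&front, sring, PAGE_SIZE);
}

/* Queue one request; notify the back end only if it asked to be woken. */
static void demo_submit(uint64_t id)
{
    struct demo_request *req;
    int notify;

    if (RING_FULL(&front))
        return;                                   /* no free slot */

    req = RING_GET_REQUEST(&front, front.req_prod_pvt);
    req->id = id;
    front.req_prod_pvt++;

    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&front, notify);
    if (notify)
        notify_remote();
}

/* Drain responses; RING_FINAL_CHECK_FOR_RESPONSES() re-arms rsp_event so the
 * back end's next response generates a notification. */
static void demo_poll_responses(void)
{
    struct demo_response *rsp;
    RING_IDX rp, i;
    int more;

    do {
        rp = front.sring->rsp_prod;
        rmb();                    /* see rsp_prod before reading responses */
        for (i = front.rsp_cons; i != rp; i++) {
            rsp = RING_GET_RESPONSE(&front, i);
            (void)rsp;            /* consume rsp->id / rsp->status here */
        }
        front.rsp_cons = i;
        RING_FINAL_CHECK_FOR_RESPONSES(&front, more);
    } while (more);
}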
97165 diff -ruNp linux-2.6.19/include/xen/interface/io/tpmif.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/tpmif.h
97166 --- linux-2.6.19/include/xen/interface/io/tpmif.h       1970-01-01 00:00:00.000000000 +0000
97167 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/tpmif.h     2007-02-02 19:11:00.000000000 +0000
97168 @@ -0,0 +1,77 @@
97169 +/******************************************************************************
97170 + * tpmif.h
97171 + *
97172 + * TPM I/O interface for Xen guest OSes.
97173 + *
97174 + * Permission is hereby granted, free of charge, to any person obtaining a copy
97175 + * of this software and associated documentation files (the "Software"), to
97176 + * deal in the Software without restriction, including without limitation the
97177 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
97178 + * sell copies of the Software, and to permit persons to whom the Software is
97179 + * furnished to do so, subject to the following conditions:
97180 + *
97181 + * The above copyright notice and this permission notice shall be included in
97182 + * all copies or substantial portions of the Software.
97183 + *
97184 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
97185 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
97186 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
97187 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97188 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
97189 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
97190 + * DEALINGS IN THE SOFTWARE.
97191 + *
97192 + * Copyright (c) 2005, IBM Corporation
97193 + *
97194 + * Author: Stefan Berger, stefanb@us.ibm.com
97195 + * Grant table support: Mahadevan Gomathisankaran
97196 + *
97197 + * This code has been derived from tools/libxc/xen/io/netif.h
97198 + *
97199 + * Copyright (c) 2003-2004, Keir Fraser
97200 + */
97201 +
97202 +#ifndef __XEN_PUBLIC_IO_TPMIF_H__
97203 +#define __XEN_PUBLIC_IO_TPMIF_H__
97204 +
97205 +#include "../grant_table.h"
97206 +
97207 +struct tpmif_tx_request {
97208 +    unsigned long addr;   /* Machine address of packet.   */
97209 +    grant_ref_t ref;      /* grant table access reference */
97210 +    uint16_t unused;
97211 +    uint16_t size;        /* Packet size in bytes.        */
97212 +};
97213 +typedef struct tpmif_tx_request tpmif_tx_request_t;
97214 +
97215 +/*
97216 + * The TPMIF_TX_RING_SIZE defines the number of pages the
97217 + * front-end and backend can exchange (= size of array).
97218 + */
97219 +typedef uint32_t TPMIF_RING_IDX;
97220 +
97221 +#define TPMIF_TX_RING_SIZE 10
97222 +
97223 +/* This structure must fit in a memory page. */
97224 +
97225 +struct tpmif_ring {
97226 +    struct tpmif_tx_request req;
97227 +};
97228 +typedef struct tpmif_ring tpmif_ring_t;
97229 +
97230 +struct tpmif_tx_interface {
97231 +    struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
97232 +};
97233 +typedef struct tpmif_tx_interface tpmif_tx_interface_t;
97234 +
97235 +#endif
97236 +
97237 +/*
97238 + * Local variables:
97239 + * mode: C
97240 + * c-set-style: "BSD"
97241 + * c-basic-offset: 4
97242 + * tab-width: 4
97243 + * indent-tabs-mode: nil
97244 + * End:
97245 + */
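Since the comment above requires the whole transmit interface to fit in one page, a front end can assert that at build time. A minimal sketch under that assumption follows; BUILD_BUG_ON() and PAGE_SIZE are taken from the surrounding Linux kernel, not from this header.

#include <linux/kernel.h>   /* BUILD_BUG_ON() */
#include <asm/page.h>       /* PAGE_SIZE */

/* Return the i-th transmit slot, checking the one-page assumption once. */
static inline tpmif_tx_request_t *
tpmif_slot(tpmif_tx_interface_t *tx, unsigned int i)
{
    BUILD_BUG_ON(sizeof(tpmif_tx_interface_t) > PAGE_SIZE);
    return &tx->ring[i % TPMIF_TX_RING_SIZE].req;
}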
97246 diff -ruNp linux-2.6.19/include/xen/interface/io/xenbus.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/xenbus.h
97247 --- linux-2.6.19/include/xen/interface/io/xenbus.h      1970-01-01 00:00:00.000000000 +0000
97248 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/xenbus.h    2007-02-02 19:11:00.000000000 +0000
97249 @@ -0,0 +1,73 @@
97250 +/*****************************************************************************
97251 + * xenbus.h
97252 + *
97253 + * Xenbus protocol details.
97254 + *
97255 + * Permission is hereby granted, free of charge, to any person obtaining a copy
97256 + * of this software and associated documentation files (the "Software"), to
97257 + * deal in the Software without restriction, including without limitation the
97258 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
97259 + * sell copies of the Software, and to permit persons to whom the Software is
97260 + * furnished to do so, subject to the following conditions:
97261 + *
97262 + * The above copyright notice and this permission notice shall be included in
97263 + * all copies or substantial portions of the Software.
97264 + *
97265 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
97266 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
97267 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
97268 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97269 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
97270 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
97271 + * DEALINGS IN THE SOFTWARE.
97272 + *
97273 + * Copyright (C) 2005 XenSource Ltd.
97274 + */
97275 +
97276 +#ifndef _XEN_PUBLIC_IO_XENBUS_H
97277 +#define _XEN_PUBLIC_IO_XENBUS_H
97278 +
97279 +/*
97280 + * The state of either end of the Xenbus, i.e. the current communication
97281 + * status of initialisation across the bus.  States here imply nothing about
97282 + * the state of the connection between the driver and the kernel's device
97283 + * layers.
97284 + */
97285 +enum xenbus_state {
97286 +    XenbusStateUnknown       = 0,
97287 +
97288 +    XenbusStateInitialising  = 1,
97289 +
97290 +    /*
97291 +     * InitWait: Finished early initialisation but waiting for information
97292 +     * from the peer or hotplug scripts.
97293 +     */
97294 +    XenbusStateInitWait      = 2,
97295 +
97296 +    /*
97297 +     * Initialised: Waiting for a connection from the peer.
97298 +     */
97299 +    XenbusStateInitialised   = 3,
97300 +
97301 +    XenbusStateConnected     = 4,
97302 +
97303 +    /*
97304 +     * Closing: The device is being closed due to an error or an unplug event.
97305 +     */
97306 +    XenbusStateClosing       = 5,
97307 +
97308 +    XenbusStateClosed        = 6
97309 +};
97310 +typedef enum xenbus_state XenbusState;
97311 +
97312 +#endif /* _XEN_PUBLIC_IO_XENBUS_H */
97313 +
97314 +/*
97315 + * Local variables:
97316 + * mode: C
97317 + * c-set-style: "BSD"
97318 + * c-basic-offset: 4
97319 + * tab-width: 4
97320 + * indent-tabs-mode: nil
97321 + * End:
97322 + */
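The enum above is the whole xenbus handshake state machine; front and back ends advance through it as they connect. A small, hypothetical debugging helper (not part of the interface) that maps each state to a printable name:

/* Illustration only: name each xenbus_state for log messages. */
static const char *xenbus_state_name(enum xenbus_state s)
{
    switch (s) {
    case XenbusStateUnknown:      return "Unknown";
    case XenbusStateInitialising: return "Initialising";
    case XenbusStateInitWait:     return "InitWait";
    case XenbusStateInitialised:  return "Initialised";
    case XenbusStateConnected:    return "Connected";
    case XenbusStateClosing:      return "Closing";
    case XenbusStateClosed:       return "Closed";
    default:                      return "INVALID";
    }
}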
97323 diff -ruNp linux-2.6.19/include/xen/interface/io/xs_wire.h linux-2.6.19-xen-3.0.4/include/xen/interface/io/xs_wire.h
97324 --- linux-2.6.19/include/xen/interface/io/xs_wire.h     1970-01-01 00:00:00.000000000 +0000
97325 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/io/xs_wire.h   2007-02-02 19:11:00.000000000 +0000
97326 @@ -0,0 +1,116 @@
97327 +/*
97328 + * Details of the "wire" protocol between Xen Store Daemon and client
97329 + * library or guest kernel.
97330 + *
97331 + * Permission is hereby granted, free of charge, to any person obtaining a copy
97332 + * of this software and associated documentation files (the "Software"), to
97333 + * deal in the Software without restriction, including without limitation the
97334 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
97335 + * sell copies of the Software, and to permit persons to whom the Software is
97336 + * furnished to do so, subject to the following conditions:
97337 + *
97338 + * The above copyright notice and this permission notice shall be included in
97339 + * all copies or substantial portions of the Software.
97340 + *
97341 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
97342 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
97343 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
97344 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97345 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
97346 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
97347 + * DEALINGS IN THE SOFTWARE.
97348 + *
97349 + * Copyright (C) 2005 Rusty Russell IBM Corporation
97350 + */
97351 +
97352 +#ifndef _XS_WIRE_H
97353 +#define _XS_WIRE_H
97354 +
97355 +enum xsd_sockmsg_type
97356 +{
97357 +    XS_DEBUG,
97358 +    XS_DIRECTORY,
97359 +    XS_READ,
97360 +    XS_GET_PERMS,
97361 +    XS_WATCH,
97362 +    XS_UNWATCH,
97363 +    XS_TRANSACTION_START,
97364 +    XS_TRANSACTION_END,
97365 +    XS_INTRODUCE,
97366 +    XS_RELEASE,
97367 +    XS_GET_DOMAIN_PATH,
97368 +    XS_WRITE,
97369 +    XS_MKDIR,
97370 +    XS_RM,
97371 +    XS_SET_PERMS,
97372 +    XS_WATCH_EVENT,
97373 +    XS_ERROR,
97374 +    XS_IS_DOMAIN_INTRODUCED
97375 +};
97376 +
97377 +#define XS_WRITE_NONE "NONE"
97378 +#define XS_WRITE_CREATE "CREATE"
97379 +#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
97380 +
97381 +/* We hand errors as strings, for portability. */
97382 +struct xsd_errors
97383 +{
97384 +    int errnum;
97385 +    const char *errstring;
97386 +};
97387 +#define XSD_ERROR(x) { x, #x }
97388 +static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
97389 +    XSD_ERROR(EINVAL),
97390 +    XSD_ERROR(EACCES),
97391 +    XSD_ERROR(EEXIST),
97392 +    XSD_ERROR(EISDIR),
97393 +    XSD_ERROR(ENOENT),
97394 +    XSD_ERROR(ENOMEM),
97395 +    XSD_ERROR(ENOSPC),
97396 +    XSD_ERROR(EIO),
97397 +    XSD_ERROR(ENOTEMPTY),
97398 +    XSD_ERROR(ENOSYS),
97399 +    XSD_ERROR(EROFS),
97400 +    XSD_ERROR(EBUSY),
97401 +    XSD_ERROR(EAGAIN),
97402 +    XSD_ERROR(EISCONN)
97403 +};
97404 +
97405 +struct xsd_sockmsg
97406 +{
97407 +    uint32_t type;  /* XS_??? */
97408 +    uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
97409 +    uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
97410 +    uint32_t len;   /* Length of data following this. */
97411 +
97412 +    /* Generally followed by nul-terminated string(s). */
97413 +};
97414 +
97415 +enum xs_watch_type
97416 +{
97417 +    XS_WATCH_PATH = 0,
97418 +    XS_WATCH_TOKEN
97419 +};
97420 +
97421 +/* Inter-domain shared memory communications. */
97422 +#define XENSTORE_RING_SIZE 1024
97423 +typedef uint32_t XENSTORE_RING_IDX;
97424 +#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
97425 +struct xenstore_domain_interface {
97426 +    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
97427 +    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
97428 +    XENSTORE_RING_IDX req_cons, req_prod;
97429 +    XENSTORE_RING_IDX rsp_cons, rsp_prod;
97430 +};
97431 +
97432 +#endif /* _XS_WIRE_H */
97433 +
97434 +/*
97435 + * Local variables:
97436 + * mode: C
97437 + * c-set-style: "BSD"
97438 + * c-basic-offset: 4
97439 + * tab-width: 4
97440 + * indent-tabs-mode: nil
97441 + * End:
97442 + */
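On the wire, every xenstore message is a struct xsd_sockmsg header immediately followed by len bytes of payload, usually nul-terminated strings. As a hedged sketch (assuming an ordinary userspace client with <stdint.h> and <string.h>, and ignoring the ring/event-channel transport), this is how an XS_READ request could be laid out in a buffer:

#include <stdint.h>
#include <string.h>

/* Build an XS_READ message for 'path'; returns bytes used, or 0 if the
 * buffer is too small. Copying it into req[] with MASK_XENSTORE_IDX() and
 * kicking the event channel is left out. */
static size_t xs_build_read(char *buf, size_t buflen,
                            uint32_t req_id, uint32_t tx_id, const char *path)
{
    struct xsd_sockmsg hdr = {
        .type   = XS_READ,
        .req_id = req_id,
        .tx_id  = tx_id,
        .len    = (uint32_t)(strlen(path) + 1),   /* include trailing NUL */
    };

    if (buflen < sizeof(hdr) + hdr.len)
        return 0;
    memcpy(buf, &hdr, sizeof(hdr));
    memcpy(buf + sizeof(hdr), path, hdr.len);
    return sizeof(hdr) + hdr.len;
}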
97443 diff -ruNp linux-2.6.19/include/xen/interface/kexec.h linux-2.6.19-xen-3.0.4/include/xen/interface/kexec.h
97444 --- linux-2.6.19/include/xen/interface/kexec.h  1970-01-01 00:00:00.000000000 +0000
97445 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/kexec.h        2007-02-02 19:11:00.000000000 +0000
97446 @@ -0,0 +1,137 @@
97447 +/******************************************************************************
97448 + * kexec.h - Public portion
97449 + * 
97450 + * Xen port written by:
97451 + * - Simon 'Horms' Horman <horms@verge.net.au>
97452 + * - Magnus Damm <magnus@valinux.co.jp>
97453 + */
97454 +
97455 +#ifndef _XEN_PUBLIC_KEXEC_H
97456 +#define _XEN_PUBLIC_KEXEC_H
97457 +
97458 +
97459 +/* This file describes the Kexec / Kdump hypercall interface for Xen.
97460 + *
97461 + * Kexec under vanilla Linux allows a user to reboot the physical machine 
97462 + * into a new user-specified kernel. The Xen port extends this idea
97463 + * to allow rebooting of the machine from dom0. When kexec for dom0
97464 + * is used to reboot,  both the hypervisor and the domains get replaced
97465 + * with some other kernel. It is possible to kexec between vanilla
97466 + * Linux and Xen and back again. Xen to Xen works well too.
97467 + *
97468 + * The hypercall interface for kexec can be divided into three main
97469 + * types of hypercall operations:
97470 + *
97471 + * 1) Range information:
97472 + *    This is used by the dom0 kernel to ask the hypervisor about various 
97473 + *    address information. This information is needed to allow kexec-tools 
97474 + *    to fill in the ELF headers for /proc/vmcore properly.
97475 + *
97476 + * 2) Load and unload of images:
97477 + *    There are no big surprises here, the kexec binary from kexec-tools
97478 + *    runs in userspace in dom0. The tool loads/unloads data into the
97479 + *    dom0 kernel such as new kernel, initramfs and hypervisor. When
97480 + *    loaded the dom0 kernel performs a load hypercall operation, and
97481 + *    before releasing all page references the dom0 kernel calls unload.
97482 + *
97483 + * 3) Kexec operation:
97484 + *    This is used to start a previously loaded kernel.
97485 + */
97486 +
97487 +#include "xen.h"
97488 +
97489 +#if defined(__i386__) || defined(__x86_64__)
97490 +#define KEXEC_XEN_NO_PAGES 17
97491 +#endif
97492 +
97493 +/*
97494 + * Prototype for this hypercall is:
97495 + *  int kexec_op(int cmd, void *args)
97496 + * @cmd  == KEXEC_CMD_... 
97497 + *          KEXEC operation to perform
97498 + * @args == Operation-specific extra arguments (NULL if none).
97499 + */
97500 +
97501 +/*
97502 + * Kexec supports two types of operation:
97503 + * - kexec into a regular kernel, very similar to a standard reboot
97504 + *   - KEXEC_TYPE_DEFAULT is used to specify this type
97505 + * - kexec into a special "crash kernel", aka kexec-on-panic
97506 + *   - KEXEC_TYPE_CRASH is used to specify this type
97507 + *   - parts of our system may be broken at kexec-on-panic time
97508 + *     - the code should be kept as simple and self-contained as possible
97509 + */
97510 +
97511 +#define KEXEC_TYPE_DEFAULT 0
97512 +#define KEXEC_TYPE_CRASH   1
97513 +
97514 +
97515 +/* The kexec implementation for Xen allows the user to load two
97516 + * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
97517 + * All data needed for a kexec reboot is kept in one xen_kexec_image_t
97518 + * per "instance". The data mainly consists of machine address lists to pages
97519 + * together with destination addresses. The data in xen_kexec_image_t
97520 + * is passed to the "code page" which is one page of code that performs
97521 + * the final relocations before jumping to the new kernel.
97522 + */
97523 + */
97524 +typedef struct xen_kexec_image {
97525 +#if defined(__i386__) || defined(__x86_64__)
97526 +    unsigned long page_list[KEXEC_XEN_NO_PAGES];
97527 +#endif
97528 +    unsigned long indirection_page;
97529 +    unsigned long start_address;
97530 +} xen_kexec_image_t;
97531 +
97532 +/*
97533 + * Perform kexec having previously loaded a kexec or kdump kernel
97534 + * as appropriate.
97535 + * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
97536 + */
97537 +#define KEXEC_CMD_kexec                 0
97538 +typedef struct xen_kexec_exec {
97539 +    int type;
97540 +} xen_kexec_exec_t;
97541 +
97542 +/*
97543 + * Load/Unload kernel image for kexec or kdump.
97544 + * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
97545 + * image == relocation information for kexec (ignored for unload) [in]
97546 + */
97547 +#define KEXEC_CMD_kexec_load            1
97548 +#define KEXEC_CMD_kexec_unload          2
97549 +typedef struct xen_kexec_load {
97550 +    int type;
97551 +    xen_kexec_image_t image;
97552 +} xen_kexec_load_t;
97553 +
97554 +#define KEXEC_RANGE_MA_CRASH 0   /* machine address and size of crash area */
97555 +#define KEXEC_RANGE_MA_XEN   1   /* machine address and size of Xen itself */
97556 +#define KEXEC_RANGE_MA_CPU   2   /* machine address and size of a CPU note */
97557 +
97558 +/*
97559 + * Find the address and size of certain memory areas
97560 + * range == KEXEC_RANGE_... [in]
97561 + * nr    == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
97562 + * size  == number of bytes reserved in window [out]
97563 + * start == address of the first byte in the window [out]
97564 + */
97565 +#define KEXEC_CMD_kexec_get_range       3
97566 +typedef struct xen_kexec_range {
97567 +    int range;
97568 +    int nr;
97569 +    unsigned long size;
97570 +    unsigned long start;
97571 +} xen_kexec_range_t;
97572 +
97573 +#endif /* _XEN_PUBLIC_KEXEC_H */
97574 +
97575 +/*
97576 + * Local variables:
97577 + * mode: C
97578 + * c-set-style: "BSD"
97579 + * c-basic-offset: 4
97580 + * tab-width: 4
97581 + * indent-tabs-mode: nil
97582 + * End:
97583 + */
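Of the three operation types listed above, the range query is the simplest to show. A hedged sketch follows; do_kexec_op() is a placeholder name for whatever kexec_op hypercall wrapper the kernel provides.

extern int do_kexec_op(int cmd, void *args);   /* placeholder prototype */

/* Ask Xen for the machine address and size of the crash region. */
static int query_crash_range(unsigned long *start, unsigned long *size)
{
    xen_kexec_range_t range = {
        .range = KEXEC_RANGE_MA_CRASH,
        .nr    = 0,
    };
    int ret = do_kexec_op(KEXEC_CMD_kexec_get_range, &range);

    if (ret == 0) {
        *start = range.start;
        *size  = range.size;
    }
    return ret;
}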
97584 diff -ruNp linux-2.6.19/include/xen/interface/memory.h linux-2.6.19-xen-3.0.4/include/xen/interface/memory.h
97585 --- linux-2.6.19/include/xen/interface/memory.h 1970-01-01 00:00:00.000000000 +0000
97586 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/memory.h       2007-02-02 19:11:00.000000000 +0000
97587 @@ -0,0 +1,276 @@
97588 +/******************************************************************************
97589 + * memory.h
97590 + * 
97591 + * Memory reservation and information.
97592 + * 
97593 + * Permission is hereby granted, free of charge, to any person obtaining a copy
97594 + * of this software and associated documentation files (the "Software"), to
97595 + * deal in the Software without restriction, including without limitation the
97596 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
97597 + * sell copies of the Software, and to permit persons to whom the Software is
97598 + * furnished to do so, subject to the following conditions:
97599 + *
97600 + * The above copyright notice and this permission notice shall be included in
97601 + * all copies or substantial portions of the Software.
97602 + *
97603 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
97604 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
97605 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
97606 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97607 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
97608 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
97609 + * DEALINGS IN THE SOFTWARE.
97610 + *
97611 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
97612 + */
97613 +
97614 +#ifndef __XEN_PUBLIC_MEMORY_H__
97615 +#define __XEN_PUBLIC_MEMORY_H__
97616 +
97617 +/*
97618 + * Increase or decrease the specified domain's memory reservation. Returns the
97619 + * number of extents successfully allocated or freed.
97620 + * arg == addr of struct xen_memory_reservation.
97621 + */
97622 +#define XENMEM_increase_reservation 0
97623 +#define XENMEM_decrease_reservation 1
97624 +#define XENMEM_populate_physmap     6
97625 +struct xen_memory_reservation {
97626 +
97627 +    /*
97628 +     * XENMEM_increase_reservation:
97629 +     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
97630 +     * XENMEM_decrease_reservation:
97631 +     *   IN:  GMFN bases of extents to free
97632 +     * XENMEM_populate_physmap:
97633 +     *   IN:  GPFN bases of extents to populate with memory
97634 +     *   OUT: GMFN bases of extents that were allocated
97635 +     *   (NB. This command also updates the mach_to_phys translation table)
97636 +     */
97637 +    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
97638 +
97639 +    /* Number of extents, and size/alignment of each (2^extent_order pages). */
97640 +    xen_ulong_t    nr_extents;
97641 +    unsigned int   extent_order;
97642 +
97643 +    /*
97644 +     * Maximum # bits addressable by the user of the allocated region (e.g., 
97645 +     * I/O devices often have a 32-bit limitation even in 64-bit systems). If 
97646 +     * zero then the user has no addressing restriction.
97647 +     * This field is not used by XENMEM_decrease_reservation.
97648 +     */
97649 +    unsigned int   address_bits;
97650 +
97651 +    /*
97652 +     * Domain whose reservation is being changed.
97653 +     * Unprivileged domains can specify only DOMID_SELF.
97654 +     */
97655 +    domid_t        domid;
97656 +};
97657 +typedef struct xen_memory_reservation xen_memory_reservation_t;
97658 +DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
97659 +
97660 +/*
97661 + * An atomic exchange of memory pages. If return code is zero then
97662 + * @out.extent_list provides GMFNs of the newly-allocated memory.
97663 + * Returns zero on complete success, otherwise a negative error code.
97664 + * On complete success, @nr_exchanged always equals @in.nr_extents.
97665 + * On partial success @nr_exchanged indicates how much work was done.
97666 + */
97667 +#define XENMEM_exchange             11
97668 +struct xen_memory_exchange {
97669 +    /*
97670 +     * [IN] Details of memory extents to be exchanged (GMFN bases).
97671 +     * Note that @in.address_bits is ignored and unused.
97672 +     */
97673 +    struct xen_memory_reservation in;
97674 +
97675 +    /*
97676 +     * [IN/OUT] Details of new memory extents.
97677 +     * We require that:
97678 +     *  1. @in.domid == @out.domid
97679 +     *  2. @in.nr_extents  << @in.extent_order == 
97680 +     *     @out.nr_extents << @out.extent_order
97681 +     *  3. @in.extent_start and @out.extent_start lists must not overlap
97682 +     *  4. @out.extent_start lists GPFN bases to be populated
97683 +     *  5. @out.extent_start is overwritten with allocated GMFN bases
97684 +     */
97685 +    struct xen_memory_reservation out;
97686 +
97687 +    /*
97688 +     * [OUT] Number of input extents that were successfully exchanged:
97689 +     *  1. The first @nr_exchanged input extents were successfully
97690 +     *     deallocated.
97691 +     *  2. The corresponding first entries in the output extent list correctly
97692 +     *     indicate the GMFNs that were successfully exchanged.
97693 +     *  3. All other input and output extents are untouched.
97694 +     *  4. If not all input extents are exchanged then the return code of this
97695 +     *     command will be non-zero.
97696 +     *  5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
97697 +     */
97698 +    xen_ulong_t nr_exchanged;
97699 +};
97700 +typedef struct xen_memory_exchange xen_memory_exchange_t;
97701 +DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
97702 +
97703 +/*
97704 + * Returns the maximum machine frame number of mapped RAM in this system.
97705 + * This command always succeeds (it never returns an error code).
97706 + * arg == NULL.
97707 + */
97708 +#define XENMEM_maximum_ram_page     2
97709 +
97710 +/*
97711 + * Returns the current or maximum memory reservation, in pages, of the
97712 + * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
97713 + * arg == addr of domid_t.
97714 + */
97715 +#define XENMEM_current_reservation  3
97716 +#define XENMEM_maximum_reservation  4
97717 +
97718 +/*
97719 + * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
97720 + * mapping table. Architectures which do not have a m2p table do not implement
97721 + * this command.
97722 + * arg == addr of xen_machphys_mfn_list_t.
97723 + */
97724 +#define XENMEM_machphys_mfn_list    5
97725 +struct xen_machphys_mfn_list {
97726 +    /*
97727 +     * Size of the 'extent_start' array. Fewer entries will be filled if the
97728 +     * machphys table is smaller than max_extents * 2MB.
97729 +     */
97730 +    unsigned int max_extents;
97731 +
97732 +    /*
97733 +     * Pointer to buffer to fill with list of extent starts. If there are
97734 +     * any large discontiguities in the machine address space, 2MB gaps in
97735 +     * the machphys table will be represented by an MFN base of zero.
97736 +     */
97737 +    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
97738 +
97739 +    /*
97740 +     * Number of extents written to the above array. This will be smaller
97741 +     * than 'max_extents' if the machphys table is smaller than max_extents * 2MB.
97742 +     */
97743 +    unsigned int nr_extents;
97744 +};
97745 +typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
97746 +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
97747 +
97748 +/*
97749 + * Returns the location in virtual address space of the machine_to_phys
97750 + * mapping table. Architectures which do not have a m2p table, or which do not
97751 + * map it by default into guest address space, do not implement this command.
97752 + * arg == addr of xen_machphys_mapping_t.
97753 + */
97754 +#define XENMEM_machphys_mapping     12
97755 +struct xen_machphys_mapping {
97756 +    xen_ulong_t v_start, v_end; /* Start and end virtual addresses.   */
97757 +    xen_ulong_t max_mfn;        /* Maximum MFN that can be looked up. */
97758 +};
97759 +typedef struct xen_machphys_mapping xen_machphys_mapping_t;
97760 +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
97761 +
97762 +/*
97763 + * Sets the GPFN at which a particular page appears in the specified guest's
97764 + * pseudophysical address space.
97765 + * arg == addr of xen_add_to_physmap_t.
97766 + */
97767 +#define XENMEM_add_to_physmap      7
97768 +struct xen_add_to_physmap {
97769 +    /* Which domain to change the mapping for. */
97770 +    domid_t domid;
97771 +
97772 +    /* Source mapping space. */
97773 +#define XENMAPSPACE_shared_info 0 /* shared info page */
97774 +#define XENMAPSPACE_grant_table 1 /* grant table page */
97775 +    unsigned int space;
97776 +
97777 +    /* Index into source mapping space. */
97778 +    xen_ulong_t idx;
97779 +
97780 +    /* GPFN where the source mapping page should appear. */
97781 +    xen_pfn_t     gpfn;
97782 +};
97783 +typedef struct xen_add_to_physmap xen_add_to_physmap_t;
97784 +DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
97785 +
97786 +/*
97787 + * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
97788 + * code on failure. This call only works for auto-translated guests.
97789 + */
97790 +#define XENMEM_translate_gpfn_list  8
97791 +struct xen_translate_gpfn_list {
97792 +    /* Which domain to translate for? */
97793 +    domid_t domid;
97794 +
97795 +    /* Length of list. */
97796 +    xen_ulong_t nr_gpfns;
97797 +
97798 +    /* List of GPFNs to translate. */
97799 +    XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
97800 +
97801 +    /*
97802 +     * Output list to contain MFN translations. May be the same as the input
97803 +     * list (in which case each input GPFN is overwritten with the output MFN).
97804 +     */
97805 +    XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
97806 +};
97807 +typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
97808 +DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
97809 +
97810 +/*
97811 + * Returns the pseudo-physical memory map as it was when the domain
97812 + * was started (specified by XENMEM_set_memory_map).
97813 + * arg == addr of xen_memory_map_t.
97814 + */
97815 +#define XENMEM_memory_map           9
97816 +struct xen_memory_map {
97817 +    /*
97818 +     * On call the number of entries which can be stored in buffer. On
97819 +     * return the number of entries which have been stored in
97820 +     * buffer.
97821 +     */
97822 +    unsigned int nr_entries;
97823 +
97824 +    /*
97825 +     * Entries in the buffer are in the same format as returned by the
97826 +     * BIOS INT 0x15 EAX=0xE820 call.
97827 +     */
97828 +    XEN_GUEST_HANDLE(void) buffer;
97829 +};
97830 +typedef struct xen_memory_map xen_memory_map_t;
97831 +DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
97832 +
97833 +/*
97834 + * Returns the real physical memory map. Passes the same structure as
97835 + * XENMEM_memory_map.
97836 + * arg == addr of xen_memory_map_t.
97837 + */
97838 +#define XENMEM_machine_memory_map   10
97839 +
97840 +/*
97841 + * Set the pseudo-physical memory map of a domain, as returned by
97842 + * XENMEM_memory_map.
97843 + * arg == addr of xen_foreign_memory_map_t.
97844 + */
97845 +#define XENMEM_set_memory_map       13
97846 +struct xen_foreign_memory_map {
97847 +    domid_t domid;
97848 +    struct xen_memory_map map;
97849 +};
97850 +typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
97851 +DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
97852 +
97853 +#endif /* __XEN_PUBLIC_MEMORY_H__ */
97854 +
97855 +/*
97856 + * Local variables:
97857 + * mode: C
97858 + * c-set-style: "BSD"
97859 + * c-basic-offset: 4
97860 + * tab-width: 4
97861 + * indent-tabs-mode: nil
97862 + * End:
97863 + */
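The reservation commands above are what the balloon driver is built on. As an illustrative sketch only: returning a single page to Xen with XENMEM_decrease_reservation. set_xen_guest_handle() comes from the guest-handle definitions elsewhere in these headers, and do_memory_op() is a placeholder for the kernel's memory_op hypercall wrapper.

extern long do_memory_op(unsigned int cmd, void *arg);   /* placeholder */

/* Give one guest page (by GMFN) back to the hypervisor. */
static long balloon_out_one_page(xen_pfn_t gmfn)
{
    struct xen_memory_reservation reservation = {
        .nr_extents   = 1,
        .extent_order = 0,      /* single page */
        .address_bits = 0,      /* not used by decrease_reservation */
        .domid        = DOMID_SELF,
    };

    set_xen_guest_handle(reservation.extent_start, &gmfn);
    /* Returns the number of extents actually freed: 1 on success. */
    return do_memory_op(XENMEM_decrease_reservation, &reservation);
}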
97864 diff -ruNp linux-2.6.19/include/xen/interface/nmi.h linux-2.6.19-xen-3.0.4/include/xen/interface/nmi.h
97865 --- linux-2.6.19/include/xen/interface/nmi.h    1970-01-01 00:00:00.000000000 +0000
97866 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/nmi.h  2007-02-02 19:11:00.000000000 +0000
97867 @@ -0,0 +1,78 @@
97868 +/******************************************************************************
97869 + * nmi.h
97870 + * 
97871 + * NMI callback registration and reason codes.
97872 + * 
97873 + * Permission is hereby granted, free of charge, to any person obtaining a copy
97874 + * of this software and associated documentation files (the "Software"), to
97875 + * deal in the Software without restriction, including without limitation the
97876 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
97877 + * sell copies of the Software, and to permit persons to whom the Software is
97878 + * furnished to do so, subject to the following conditions:
97879 + *
97880 + * The above copyright notice and this permission notice shall be included in
97881 + * all copies or substantial portions of the Software.
97882 + *
97883 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
97884 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
97885 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
97886 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97887 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
97888 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
97889 + * DEALINGS IN THE SOFTWARE.
97890 + *
97891 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
97892 + */
97893 +
97894 +#ifndef __XEN_PUBLIC_NMI_H__
97895 +#define __XEN_PUBLIC_NMI_H__
97896 +
97897 +/*
97898 + * NMI reason codes:
97899 + * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
97900 + */
97901 + /* I/O-check error reported via ISA port 0x61, bit 6. */
97902 +#define _XEN_NMIREASON_io_error     0
97903 +#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
97904 + /* Parity error reported via ISA port 0x61, bit 7. */
97905 +#define _XEN_NMIREASON_parity_error 1
97906 +#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
97907 + /* Unknown hardware-generated NMI. */
97908 +#define _XEN_NMIREASON_unknown      2
97909 +#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
97910 +
97911 +/*
97912 + * long nmi_op(unsigned int cmd, void *arg)
97913 + * NB. All ops return zero on success, else a negative error code.
97914 + */
97915 +
97916 +/*
97917 + * Register NMI callback for this (calling) VCPU. Currently this only makes
97918 + * sense for domain 0, vcpu 0. All other callers will receive EINVAL.
97919 + * arg == pointer to xennmi_callback structure.
97920 + */
97921 +#define XENNMI_register_callback   0
97922 +struct xennmi_callback {
97923 +    unsigned long handler_address;
97924 +    unsigned long pad;
97925 +};
97926 +typedef struct xennmi_callback xennmi_callback_t;
97927 +DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
97928 +
97929 +/*
97930 + * Deregister NMI callback for this (calling) VCPU.
97931 + * arg == NULL.
97932 + */
97933 +#define XENNMI_unregister_callback 1
97934 +
97935 +#endif /* __XEN_PUBLIC_NMI_H__ */
97936 +
97937 +/*
97938 + * Local variables:
97939 + * mode: C
97940 + * c-set-style: "BSD"
97941 + * c-basic-offset: 4
97942 + * tab-width: 4
97943 + * indent-tabs-mode: nil
97944 + * End:
97945 + */
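Only dom0's VCPU 0 may register an NMI callback, as noted above. A minimal sketch, with do_nmi_op() and nmi_entry() as placeholders for the kernel's hypercall wrapper and low-level entry stub:

extern long do_nmi_op(unsigned int cmd, void *arg);   /* placeholder */
extern void nmi_entry(void);                          /* placeholder stub */

static long register_guest_nmi_handler(void)
{
    struct xennmi_callback cb = {
        .handler_address = (unsigned long)nmi_entry,
        .pad             = 0,
    };
    return do_nmi_op(XENNMI_register_callback, &cb);
}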
97946 diff -ruNp linux-2.6.19/include/xen/interface/physdev.h linux-2.6.19-xen-3.0.4/include/xen/interface/physdev.h
97947 --- linux-2.6.19/include/xen/interface/physdev.h        1970-01-01 00:00:00.000000000 +0000
97948 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/physdev.h      2007-02-02 19:11:00.000000000 +0000
97949 @@ -0,0 +1,169 @@
97950 +/*
97951 + * Permission is hereby granted, free of charge, to any person obtaining a copy
97952 + * of this software and associated documentation files (the "Software"), to
97953 + * deal in the Software without restriction, including without limitation the
97954 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
97955 + * sell copies of the Software, and to permit persons to whom the Software is
97956 + * furnished to do so, subject to the following conditions:
97957 + *
97958 + * The above copyright notice and this permission notice shall be included in
97959 + * all copies or substantial portions of the Software.
97960 + *
97961 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
97962 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
97963 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
97964 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
97965 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
97966 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
97967 + * DEALINGS IN THE SOFTWARE.
97968 + */
97969 +
97970 +#ifndef __XEN_PUBLIC_PHYSDEV_H__
97971 +#define __XEN_PUBLIC_PHYSDEV_H__
97972 +
97973 +/*
97974 + * Prototype for this hypercall is:
97975 + *  int physdev_op(int cmd, void *args)
97976 + * @cmd  == PHYSDEVOP_??? (physdev operation).
97977 + * @args == Operation-specific extra arguments (NULL if none).
97978 + */
97979 +
97980 +/*
97981 + * Notify end-of-interrupt (EOI) for the specified IRQ.
97982 + * @arg == pointer to physdev_eoi structure.
97983 + */
97984 +#define PHYSDEVOP_eoi                   12
97985 +struct physdev_eoi {
97986 +    /* IN */
97987 +    uint32_t irq;
97988 +};
97989 +typedef struct physdev_eoi physdev_eoi_t;
97990 +DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
97991 +
97992 +/*
97993 + * Query the status of an IRQ line.
97994 + * @arg == pointer to physdev_irq_status_query structure.
97995 + */
97996 +#define PHYSDEVOP_irq_status_query       5
97997 +struct physdev_irq_status_query {
97998 +    /* IN */
97999 +    uint32_t irq;
98000 +    /* OUT */
98001 +    uint32_t flags; /* XENIRQSTAT_* */
98002 +};
98003 +typedef struct physdev_irq_status_query physdev_irq_status_query_t;
98004 +DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
98005 +
98006 +/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
98007 +#define _XENIRQSTAT_needs_eoi   (0)
98008 +#define  XENIRQSTAT_needs_eoi   (1U<<_XENIRQSTAT_needs_eoi)
98009 +
98010 +/* IRQ shared by multiple guests? */
98011 +#define _XENIRQSTAT_shared      (1)
98012 +#define  XENIRQSTAT_shared      (1U<<_XENIRQSTAT_shared)
98013 +
98014 +/*
98015 + * Set the current VCPU's I/O privilege level.
98016 + * @arg == pointer to physdev_set_iopl structure.
98017 + */
98018 +#define PHYSDEVOP_set_iopl               6
98019 +struct physdev_set_iopl {
98020 +    /* IN */
98021 +    uint32_t iopl;
98022 +};
98023 +typedef struct physdev_set_iopl physdev_set_iopl_t;
98024 +DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
98025 +
98026 +/*
98027 + * Set the current VCPU's I/O-port permissions bitmap.
98028 + * @arg == pointer to physdev_set_iobitmap structure.
98029 + */
98030 +#define PHYSDEVOP_set_iobitmap           7
98031 +struct physdev_set_iobitmap {
98032 +    /* IN */
98033 +    XEN_GUEST_HANDLE_00030205(uint8_t) bitmap;
98034 +    uint32_t nr_ports;
98035 +};
98036 +typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
98037 +DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
98038 +
98039 +/*
98040 + * Read or write an IO-APIC register.
98041 + * @arg == pointer to physdev_apic structure.
98042 + */
98043 +#define PHYSDEVOP_apic_read              8
98044 +#define PHYSDEVOP_apic_write             9
98045 +struct physdev_apic {
98046 +    /* IN */
98047 +    unsigned long apic_physbase;
98048 +    uint32_t reg;
98049 +    /* IN or OUT */
98050 +    uint32_t value;
98051 +};
98052 +typedef struct physdev_apic physdev_apic_t;
98053 +DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
98054 +
98055 +/*
98056 + * Allocate or free a physical upcall vector for the specified IRQ line.
98057 + * @arg == pointer to physdev_irq structure.
98058 + */
98059 +#define PHYSDEVOP_alloc_irq_vector      10
98060 +#define PHYSDEVOP_free_irq_vector       11
98061 +struct physdev_irq {
98062 +    /* IN */
98063 +    uint32_t irq;
98064 +    /* IN or OUT */
98065 +    uint32_t vector;
98066 +};
98067 +typedef struct physdev_irq physdev_irq_t;
98068 +DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
98069 +
98070 +/*
98071 + * Argument to physdev_op_compat() hypercall. Superseded by new physdev_op()
98072 + * hypercall since 0x00030202.
98073 + */
98074 +struct physdev_op {
98075 +    uint32_t cmd;
98076 +    union {
98077 +        struct physdev_irq_status_query      irq_status_query;
98078 +        struct physdev_set_iopl              set_iopl;
98079 +        struct physdev_set_iobitmap          set_iobitmap;
98080 +        struct physdev_apic                  apic_op;
98081 +        struct physdev_irq                   irq_op;
98082 +    } u;
98083 +};
98084 +typedef struct physdev_op physdev_op_t;
98085 +DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
98086 +
98087 +/*
98088 + * Notify that some PIRQ-bound event channels have been unmasked.
98089 + * ** This command is obsolete since interface version 0x00030202 and is **
98090 + * ** unsupported by newer versions of Xen.                              **
98091 + */
98092 +#define PHYSDEVOP_IRQ_UNMASK_NOTIFY      4
98093 +
98094 +/*
98095 + * These all-capitals physdev operation names are superseded by the new names
98096 + * (defined above) since interface version 0x00030202.
98097 + */
98098 +#define PHYSDEVOP_IRQ_STATUS_QUERY       PHYSDEVOP_irq_status_query
98099 +#define PHYSDEVOP_SET_IOPL               PHYSDEVOP_set_iopl
98100 +#define PHYSDEVOP_SET_IOBITMAP           PHYSDEVOP_set_iobitmap
98101 +#define PHYSDEVOP_APIC_READ              PHYSDEVOP_apic_read
98102 +#define PHYSDEVOP_APIC_WRITE             PHYSDEVOP_apic_write
98103 +#define PHYSDEVOP_ASSIGN_VECTOR          PHYSDEVOP_alloc_irq_vector
98104 +#define PHYSDEVOP_FREE_VECTOR            PHYSDEVOP_free_irq_vector
98105 +#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
98106 +#define PHYSDEVOP_IRQ_SHARED             XENIRQSTAT_shared
98107 +
98108 +#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
98109 +
98110 +/*
98111 + * Local variables:
98112 + * mode: C
98113 + * c-set-style: "BSD"
98114 + * c-basic-offset: 4
98115 + * tab-width: 4
98116 + * indent-tabs-mode: nil
98117 + * End:
98118 + */
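The status-query and EOI operations above are typically used together: the guest asks whether a line needs an explicit EOI and, if so, sends one after servicing it. A hedged sketch, with do_physdev_op() standing in for the kernel's physdev_op hypercall wrapper:

extern int do_physdev_op(int cmd, void *args);   /* placeholder */

/* Acknowledge a PIRQ, sending an EOI only when Xen reports it is needed. */
static void ack_pirq(uint32_t irq)
{
    struct physdev_irq_status_query q = { .irq = irq };

    if (do_physdev_op(PHYSDEVOP_irq_status_query, &q) == 0 &&
        (q.flags & XENIRQSTAT_needs_eoi)) {
        struct physdev_eoi eoi = { .irq = irq };
        do_physdev_op(PHYSDEVOP_eoi, &eoi);
    }
}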
98119 diff -ruNp linux-2.6.19/include/xen/interface/platform.h linux-2.6.19-xen-3.0.4/include/xen/interface/platform.h
98120 --- linux-2.6.19/include/xen/interface/platform.h       1970-01-01 00:00:00.000000000 +0000
98121 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/platform.h     2007-02-02 19:11:00.000000000 +0000
98122 @@ -0,0 +1,143 @@
98123 +/******************************************************************************
98124 + * platform.h
98125 + * 
98126 + * Hardware platform operations. Intended for use by domain-0 kernel.
98127 + * 
98128 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98129 + * of this software and associated documentation files (the "Software"), to
98130 + * deal in the Software without restriction, including without limitation the
98131 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98132 + * sell copies of the Software, and to permit persons to whom the Software is
98133 + * furnished to do so, subject to the following conditions:
98134 + *
98135 + * The above copyright notice and this permission notice shall be included in
98136 + * all copies or substantial portions of the Software.
98137 + *
98138 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98139 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98140 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98141 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98142 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98143 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98144 + * DEALINGS IN THE SOFTWARE.
98145 + *
98146 + * Copyright (c) 2002-2006, K Fraser
98147 + */
98148 +
98149 +#ifndef __XEN_PUBLIC_PLATFORM_H__
98150 +#define __XEN_PUBLIC_PLATFORM_H__
98151 +
98152 +#include "xen.h"
98153 +
98154 +#define XENPF_INTERFACE_VERSION 0x03000001
98155 +
98156 +/*
98157 + * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
98158 + * 1 January, 1970 if the current system time was <system_time>.
98159 + */
98160 +#define XENPF_settime             17
98161 +struct xenpf_settime {
98162 +    /* IN variables. */
98163 +    uint32_t secs;
98164 +    uint32_t nsecs;
98165 +    uint64_t system_time;
98166 +};
98167 +typedef struct xenpf_settime xenpf_settime_t;
98168 +DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
98169 +
98170 +/*
98171 + * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
98172 + * On x86, @type is an architecture-defined MTRR memory type.
98173 + * On success, returns the MTRR that was used (@reg) and a handle that can
98174 + * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting.
98175 + * (x86-specific).
98176 + */
98177 +#define XENPF_add_memtype         31
98178 +struct xenpf_add_memtype {
98179 +    /* IN variables. */
98180 +    xen_pfn_t mfn;
98181 +    uint64_t nr_mfns;
98182 +    uint32_t type;
98183 +    /* OUT variables. */
98184 +    uint32_t handle;
98185 +    uint32_t reg;
98186 +};
98187 +typedef struct xenpf_add_memtype xenpf_add_memtype_t;
98188 +DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t);
98189 +
98190 +/*
98191 + * Tear down an existing memory-range type. If @handle is remembered then it
98192 + * should be passed in to accurately tear down the correct setting (in case
98193 + * of overlapping memory regions with differing types). If it is not known
98194 + * then @handle should be set to zero. In all cases @reg must be set.
98195 + * (x86-specific).
98196 + */
98197 +#define XENPF_del_memtype         32
98198 +struct xenpf_del_memtype {
98199 +    /* IN variables. */
98200 +    uint32_t handle;
98201 +    uint32_t reg;
98202 +};
98203 +typedef struct xenpf_del_memtype xenpf_del_memtype_t;
98204 +DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t);
98205 +
98206 +/* Read current type of an MTRR (x86-specific). */
98207 +#define XENPF_read_memtype        33
98208 +struct xenpf_read_memtype {
98209 +    /* IN variables. */
98210 +    uint32_t reg;
98211 +    /* OUT variables. */
98212 +    xen_pfn_t mfn;
98213 +    uint64_t nr_mfns;
98214 +    uint32_t type;
98215 +};
98216 +typedef struct xenpf_read_memtype xenpf_read_memtype_t;
98217 +DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t);
98218 +
98219 +#define XENPF_microcode_update    35
98220 +struct xenpf_microcode_update {
98221 +    /* IN variables. */
98222 +    XEN_GUEST_HANDLE(void) data;      /* Pointer to microcode data */
98223 +    uint32_t length;                  /* Length of microcode data. */
98224 +};
98225 +typedef struct xenpf_microcode_update xenpf_microcode_update_t;
98226 +DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t);
98227 +
98228 +#define XENPF_platform_quirk      39
98229 +#define QUIRK_NOIRQBALANCING      1 /* Do not restrict IO-APIC RTE targets */
98230 +#define QUIRK_IOAPIC_BAD_REGSEL   2 /* IO-APIC REGSEL forgets its value    */
98231 +#define QUIRK_IOAPIC_GOOD_REGSEL  3 /* IO-APIC REGSEL behaves properly     */
98232 +struct xenpf_platform_quirk {
98233 +    /* IN variables. */
98234 +    uint32_t quirk_id;
98235 +};
98236 +typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
98237 +DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
98238 +
98239 +struct xen_platform_op {
98240 +    uint32_t cmd;
98241 +    uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
98242 +    union {
98243 +        struct xenpf_settime           settime;
98244 +        struct xenpf_add_memtype       add_memtype;
98245 +        struct xenpf_del_memtype       del_memtype;
98246 +        struct xenpf_read_memtype      read_memtype;
98247 +        struct xenpf_microcode_update  microcode;
98248 +        struct xenpf_platform_quirk    platform_quirk;
98249 +        uint8_t                        pad[128];
98250 +    } u;
98251 +};
98252 +typedef struct xen_platform_op xen_platform_op_t;
98253 +DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t);
98254 +
98255 +#endif /* __XEN_PUBLIC_PLATFORM_H__ */
98256 +
98257 +/*
98258 + * Local variables:
98259 + * mode: C
98260 + * c-set-style: "BSD"
98261 + * c-basic-offset: 4
98262 + * tab-width: 4
98263 + * indent-tabs-mode: nil
98264 + * End:
98265 + */
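Every platform operation travels through the one struct xen_platform_op defined above, with cmd and interface_version filled in and the per-command arguments placed in the union. A rough sketch of XENPF_settime follows; do_platform_op() is a placeholder for the real hypercall wrapper.

extern int do_platform_op(struct xen_platform_op *op);   /* placeholder */

/* Dom0 pushes the wall-clock time, paired with the Xen system time (ns)
 * at which that reading was taken. */
static int push_wallclock(uint32_t secs, uint32_t nsecs, uint64_t system_time)
{
    struct xen_platform_op op = {
        .cmd               = XENPF_settime,
        .interface_version = XENPF_INTERFACE_VERSION,
    };

    op.u.settime.secs        = secs;
    op.u.settime.nsecs       = nsecs;
    op.u.settime.system_time = system_time;
    return do_platform_op(&op);
}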
98266 diff -ruNp linux-2.6.19/include/xen/interface/sched.h linux-2.6.19-xen-3.0.4/include/xen/interface/sched.h
98267 --- linux-2.6.19/include/xen/interface/sched.h  1970-01-01 00:00:00.000000000 +0000
98268 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/sched.h        2007-02-02 19:11:00.000000000 +0000
98269 @@ -0,0 +1,121 @@
98270 +/******************************************************************************
98271 + * sched.h
98272 + * 
98273 + * Scheduler state interactions
98274 + * 
98275 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98276 + * of this software and associated documentation files (the "Software"), to
98277 + * deal in the Software without restriction, including without limitation the
98278 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98279 + * sell copies of the Software, and to permit persons to whom the Software is
98280 + * furnished to do so, subject to the following conditions:
98281 + *
98282 + * The above copyright notice and this permission notice shall be included in
98283 + * all copies or substantial portions of the Software.
98284 + *
98285 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98286 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98287 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98288 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98289 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98290 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98291 + * DEALINGS IN THE SOFTWARE.
98292 + *
98293 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
98294 + */
98295 +
98296 +#ifndef __XEN_PUBLIC_SCHED_H__
98297 +#define __XEN_PUBLIC_SCHED_H__
98298 +
98299 +#include "event_channel.h"
98300 +
98301 +/*
98302 + * The prototype for this hypercall is:
98303 + *  long sched_op(int cmd, void *arg)
98304 + * @cmd == SCHEDOP_??? (scheduler operation).
98305 + * @arg == Operation-specific extra argument(s), as described below.
98306 + * 
98307 + * Versions of Xen prior to 3.0.2 provided only the following legacy version
98308 + * of this hypercall, supporting only the commands yield, block and shutdown:
98309 + *  long sched_op(int cmd, unsigned long arg)
98310 + * @cmd == SCHEDOP_??? (scheduler operation).
98311 + * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
98312 + *      == SHUTDOWN_* code (SCHEDOP_shutdown)
98313 + * This legacy version is available to new guests as sched_op_compat().
98314 + */
98315 +
98316 +/*
98317 + * Voluntarily yield the CPU.
98318 + * @arg == NULL.
98319 + */
98320 +#define SCHEDOP_yield       0
98321 +
98322 +/*
98323 + * Block execution of this VCPU until an event is received for processing.
98324 + * If called with event upcalls masked, this operation will atomically
98325 + * reenable event delivery and check for pending events before blocking the
98326 + * VCPU. This avoids a "wakeup waiting" race.
98327 + * @arg == NULL.
98328 + */
98329 +#define SCHEDOP_block       1
98330 +
98331 +/*
98332 + * Halt execution of this domain (all VCPUs) and notify the system controller.
98333 + * @arg == pointer to sched_shutdown structure.
98334 + */
98335 +#define SCHEDOP_shutdown    2
98336 +struct sched_shutdown {
98337 +    unsigned int reason; /* SHUTDOWN_* */
98338 +};
98339 +typedef struct sched_shutdown sched_shutdown_t;
98340 +DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
98341 +
98342 +/*
98343 + * Poll a set of event-channel ports. Return when one or more are pending. An
98344 + * optional timeout may be specified.
98345 + * @arg == pointer to sched_poll structure.
98346 + */
98347 +#define SCHEDOP_poll        3
98348 +struct sched_poll {
98349 +    XEN_GUEST_HANDLE(evtchn_port_t) ports;
98350 +    unsigned int nr_ports;
98351 +    uint64_t timeout;
98352 +};
98353 +typedef struct sched_poll sched_poll_t;
98354 +DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
98355 +
98356 +/*
98357 + * Declare a shutdown for another domain. The main use of this function is
98358 + * in interpreting shutdown requests and reasons for fully-virtualized
98359 + * domains.  A para-virtualized domain may use SCHEDOP_shutdown directly.
98360 + * @arg == pointer to sched_remote_shutdown structure.
98361 + */
98362 +#define SCHEDOP_remote_shutdown        4
98363 +struct sched_remote_shutdown {
98364 +    domid_t domain_id;         /* Remote domain ID */
98365 +    unsigned int reason;       /* SHUTDOWN_xxx reason */
98366 +};
98367 +typedef struct sched_remote_shutdown sched_remote_shutdown_t;
98368 +DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
98369 +
98370 +/*
98371 + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
98372 + * software to determine the appropriate action. For the most part, Xen does
98373 + * not care about the shutdown code.
98374 + */
98375 +#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
98376 +#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
98377 +#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
98378 +#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
98379 +
98380 +#endif /* __XEN_PUBLIC_SCHED_H__ */
98381 +
98382 +/*
98383 + * Local variables:
98384 + * mode: C
98385 + * c-set-style: "BSD"
98386 + * c-basic-offset: 4
98387 + * tab-width: 4
98388 + * indent-tabs-mode: nil
98389 + * End:
98390 + */
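As a hedged illustration of how a paravirtualised guest drives this interface (not part of the patch itself), the sketch below issues a reboot request and, separately, polls a single event-channel port. It assumes the HYPERVISOR_sched_op() wrapper and the set_xen_guest_handle() helper that this patch provides in its hypercall and interface headers:

/* Sketch: ask Xen to reboot the calling domain. */
static void request_reboot(void)
{
    struct sched_shutdown shutdown = { .reason = SHUTDOWN_reboot };
    HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);
}

/* Sketch: block until the given port is pending (or the optional timeout fires). */
static int poll_one_port(evtchn_port_t port, uint64_t timeout)
{
    struct sched_poll poll = { .nr_ports = 1, .timeout = timeout };
    set_xen_guest_handle(poll.ports, &port);
    return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
}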
98391 diff -ruNp linux-2.6.19/include/xen/interface/sysctl.h linux-2.6.19-xen-3.0.4/include/xen/interface/sysctl.h
98392 --- linux-2.6.19/include/xen/interface/sysctl.h 1970-01-01 00:00:00.000000000 +0000
98393 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/sysctl.h       2007-02-02 19:11:00.000000000 +0000
98394 @@ -0,0 +1,169 @@
98395 +/******************************************************************************
98396 + * sysctl.h
98397 + * 
98398 + * System management operations. For use by node control stack.
98399 + * 
98400 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98401 + * of this software and associated documentation files (the "Software"), to
98402 + * deal in the Software without restriction, including without limitation the
98403 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98404 + * sell copies of the Software, and to permit persons to whom the Software is
98405 + * furnished to do so, subject to the following conditions:
98406 + *
98407 + * The above copyright notice and this permission notice shall be included in
98408 + * all copies or substantial portions of the Software.
98409 + *
98410 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98411 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98412 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98413 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98414 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98415 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98416 + * DEALINGS IN THE SOFTWARE.
98417 + *
98418 + * Copyright (c) 2002-2006, K Fraser
98419 + */
98420 +
98421 +#ifndef __XEN_PUBLIC_SYSCTL_H__
98422 +#define __XEN_PUBLIC_SYSCTL_H__
98423 +
98424 +#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
98425 +#error "sysctl operations are intended for use by node control tools only"
98426 +#endif
98427 +
98428 +#include "xen.h"
98429 +#include "domctl.h"
98430 +
98431 +#define XEN_SYSCTL_INTERFACE_VERSION 0x00000002
98432 +
98433 +/*
98434 + * Read console content from Xen buffer ring.
98435 + */
98436 +#define XEN_SYSCTL_readconsole       1
98437 +struct xen_sysctl_readconsole {
98438 +    /* IN variables. */
98439 +    uint32_t clear;                /* Non-zero -> clear after reading. */
98440 +    XEN_GUEST_HANDLE(char) buffer; /* Buffer start */
98441 +    /* IN/OUT variables. */
98442 +    uint32_t count;            /* In: Buffer size;  Out: Used buffer size  */
98443 +};
98444 +typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t;
98445 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t);
98446 +
98447 +/* Get trace buffers' machine base address */
98448 +#define XEN_SYSCTL_tbuf_op           2
98449 +struct xen_sysctl_tbuf_op {
98450 +    /* IN variables */
98451 +#define XEN_SYSCTL_TBUFOP_get_info     0
98452 +#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
98453 +#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
98454 +#define XEN_SYSCTL_TBUFOP_set_size     3
98455 +#define XEN_SYSCTL_TBUFOP_enable       4
98456 +#define XEN_SYSCTL_TBUFOP_disable      5
98457 +    uint32_t cmd;
98458 +    /* IN/OUT variables */
98459 +    struct xenctl_cpumap cpu_mask;
98460 +    uint32_t             evt_mask;
98461 +    /* OUT variables */
98462 +    uint64_t buffer_mfn;
98463 +    uint32_t size;
98464 +};
98465 +typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
98466 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
98467 +
98468 +/*
98469 + * Get physical information about the host machine
98470 + */
98471 +#define XEN_SYSCTL_physinfo          3
98472 +struct xen_sysctl_physinfo {
98473 +    uint32_t threads_per_core;
98474 +    uint32_t cores_per_socket;
98475 +    uint32_t sockets_per_node;
98476 +    uint32_t nr_nodes;
98477 +    uint32_t cpu_khz;
98478 +    uint64_t total_pages;
98479 +    uint64_t free_pages;
98480 +    uint64_t scrub_pages;
98481 +    uint32_t hw_cap[8];
98482 +};
98483 +typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
98484 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
98485 +
98486 +/*
98487 + * Get the ID of the current scheduler.
98488 + */
98489 +#define XEN_SYSCTL_sched_id          4
98490 +struct xen_sysctl_sched_id {
98491 +    /* OUT variable */
98492 +    uint32_t sched_id;
98493 +};
98494 +typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t;
98495 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t);
98496 +
98497 +/* Interface for controlling Xen software performance counters. */
98498 +#define XEN_SYSCTL_perfc_op          5
98499 +/* Sub-operations: */
98500 +#define XEN_SYSCTL_PERFCOP_reset 1   /* Reset all counters to zero. */
98501 +#define XEN_SYSCTL_PERFCOP_query 2   /* Get perfctr information. */
98502 +struct xen_sysctl_perfc_desc {
98503 +    char         name[80];             /* name of perf counter */
98504 +    uint32_t     nr_vals;              /* number of values for this counter */
98505 +};
98506 +typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t;
98507 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t);
98508 +typedef uint32_t xen_sysctl_perfc_val_t;
98509 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t);
98510 +
98511 +struct xen_sysctl_perfc_op {
98512 +    /* IN variables. */
98513 +    uint32_t       cmd;                /*  XEN_SYSCTL_PERFCOP_??? */
98514 +    /* OUT variables. */
98515 +    uint32_t       nr_counters;       /*  number of counter descriptions  */
98516 +    uint32_t       nr_vals;           /*  number of values  */
98517 +    /* counter information (or NULL) */
98518 +    XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t) desc;
98519 +    /* counter values (or NULL) */
98520 +    XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t) val;
98521 +};
98522 +typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t;
98523 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t);
98524 +
98525 +#define XEN_SYSCTL_getdomaininfolist 6
98526 +struct xen_sysctl_getdomaininfolist {
98527 +    /* IN variables. */
98528 +    domid_t               first_domain;
98529 +    uint32_t              max_domains;
98530 +    XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t) buffer;
98531 +    /* OUT variables. */
98532 +    uint32_t              num_domains;
98533 +};
98534 +typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t;
98535 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t);
98536 +
98537 +struct xen_sysctl {
98538 +    uint32_t cmd;
98539 +    uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
98540 +    union {
98541 +        struct xen_sysctl_readconsole       readconsole;
98542 +        struct xen_sysctl_tbuf_op           tbuf_op;
98543 +        struct xen_sysctl_physinfo          physinfo;
98544 +        struct xen_sysctl_sched_id          sched_id;
98545 +        struct xen_sysctl_perfc_op          perfc_op;
98546 +        struct xen_sysctl_getdomaininfolist getdomaininfolist;
98547 +        uint8_t                             pad[128];
98548 +    } u;
98549 +};
98550 +typedef struct xen_sysctl xen_sysctl_t;
98551 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t);
98552 +
98553 +#endif /* __XEN_PUBLIC_SYSCTL_H__ */
98554 +
98555 +/*
98556 + * Local variables:
98557 + * mode: C
98558 + * c-set-style: "BSD"
98559 + * c-basic-offset: 4
98560 + * tab-width: 4
98561 + * indent-tabs-mode: nil
98562 + * End:
98563 + */
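The top-level xen_sysctl structure follows the same cmd-plus-union convention as the other interfaces in this patch. A hedged sketch of querying host physical information is shown below; do_xen_sysctl() is a hypothetical wrapper around the __HYPERVISOR_sysctl hypercall, since control tools normally reach this interface through the privcmd driver rather than a direct in-kernel call:

/* Hypothetical wrapper; real tools issue the hypercall via privcmd. */
extern int do_xen_sysctl(struct xen_sysctl *sysctl);

/* Sketch: fetch CPU/memory topology information for the host. */
static int query_physinfo(struct xen_sysctl_physinfo *info)
{
    struct xen_sysctl op = {
        .cmd               = XEN_SYSCTL_physinfo,
        .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
    };
    int rc = do_xen_sysctl(&op);

    if (rc == 0)
        *info = op.u.physinfo;   /* OUT fields are filled in by Xen */
    return rc;
}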
98564 diff -ruNp linux-2.6.19/include/xen/interface/trace.h linux-2.6.19-xen-3.0.4/include/xen/interface/trace.h
98565 --- linux-2.6.19/include/xen/interface/trace.h  1970-01-01 00:00:00.000000000 +0000
98566 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/trace.h        2007-02-02 19:11:00.000000000 +0000
98567 @@ -0,0 +1,102 @@
98568 +/******************************************************************************
98569 + * include/public/trace.h
98570 + * 
98571 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98572 + * of this software and associated documentation files (the "Software"), to
98573 + * deal in the Software without restriction, including without limitation the
98574 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98575 + * sell copies of the Software, and to permit persons to whom the Software is
98576 + * furnished to do so, subject to the following conditions:
98577 + *
98578 + * The above copyright notice and this permission notice shall be included in
98579 + * all copies or substantial portions of the Software.
98580 + *
98581 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98582 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98583 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98584 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98585 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98586 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98587 + * DEALINGS IN THE SOFTWARE.
98588 + *
98589 + * Mark Williamson, (C) 2004 Intel Research Cambridge
98590 + * Copyright (C) 2005 Bin Ren
98591 + */
98592 +
98593 +#ifndef __XEN_PUBLIC_TRACE_H__
98594 +#define __XEN_PUBLIC_TRACE_H__
98595 +
98596 +/* Trace classes */
98597 +#define TRC_CLS_SHIFT 16
98598 +#define TRC_GEN     0x0001f000    /* General trace            */
98599 +#define TRC_SCHED   0x0002f000    /* Xen Scheduler trace      */
98600 +#define TRC_DOM0OP  0x0004f000    /* Xen DOM0 operation trace */
98601 +#define TRC_VMX     0x0008f000    /* Xen VMX trace            */
98602 +#define TRC_MEM     0x0010f000    /* Xen memory trace         */
98603 +#define TRC_ALL     0xfffff000
98604 +
98605 +/* Trace subclasses */
98606 +#define TRC_SUBCLS_SHIFT 12
98607 +
98608 +/* trace subclasses for VMX */
98609 +#define TRC_VMXEXIT  0x00081000   /* VMX exit trace            */
98610 +#define TRC_VMXENTRY 0x00082000   /* VMX entry trace           */
98611 +#define TRC_VMXINTR  0x00084000   /* VMX interrupt trace       */
98612 +
98613 +/* Trace events per class */
98614 +#define TRC_LOST_RECORDS        (TRC_GEN + 1)
98615 +
98616 +#define TRC_SCHED_DOM_ADD       (TRC_SCHED +  1)
98617 +#define TRC_SCHED_DOM_REM       (TRC_SCHED +  2)
98618 +#define TRC_SCHED_SLEEP         (TRC_SCHED +  3)
98619 +#define TRC_SCHED_WAKE          (TRC_SCHED +  4)
98620 +#define TRC_SCHED_YIELD         (TRC_SCHED +  5)
98621 +#define TRC_SCHED_BLOCK         (TRC_SCHED +  6)
98622 +#define TRC_SCHED_SHUTDOWN      (TRC_SCHED +  7)
98623 +#define TRC_SCHED_CTL           (TRC_SCHED +  8)
98624 +#define TRC_SCHED_ADJDOM        (TRC_SCHED +  9)
98625 +#define TRC_SCHED_SWITCH        (TRC_SCHED + 10)
98626 +#define TRC_SCHED_S_TIMER_FN    (TRC_SCHED + 11)
98627 +#define TRC_SCHED_T_TIMER_FN    (TRC_SCHED + 12)
98628 +#define TRC_SCHED_DOM_TIMER_FN  (TRC_SCHED + 13)
98629 +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
98630 +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
98631 +
98632 +#define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
98633 +#define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
98634 +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
98635 +
98636 +/* trace events per subclass */
98637 +#define TRC_VMX_VMEXIT          (TRC_VMXEXIT + 1)
98638 +#define TRC_VMX_VMENTRY         (TRC_VMXENTRY + 1)
98639 +#define TRC_VMX_INTR            (TRC_VMXINTR + 1)
98640 +
98641 +
98642 +/* This structure represents a single trace buffer record. */
98643 +struct t_rec {
98644 +    uint64_t cycles;          /* cycle counter timestamp */
98645 +    uint32_t event;           /* event ID                */
98646 +    unsigned long data[5];    /* event data items        */
98647 +};
98648 +
98649 +/*
98650 + * This structure contains the metadata for a single trace buffer.  The cons
98651 + * and prod fields index into an array of struct t_rec's.
98652 + */
98653 +struct t_buf {
98654 +    uint32_t cons;      /* Next item to be consumed by control tools. */
98655 +    uint32_t prod;      /* Next item to be produced by Xen.           */
98656 +    /* 'nr_recs' records follow immediately after the meta-data header.    */
98657 +};
98658 +
98659 +#endif /* __XEN_PUBLIC_TRACE_H__ */
98660 +
98661 +/*
98662 + * Local variables:
98663 + * mode: C
98664 + * c-set-style: "BSD"
98665 + * c-basic-offset: 4
98666 + * tab-width: 4
98667 + * indent-tabs-mode: nil
98668 + * End:
98669 + */
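The cons/prod pair in struct t_buf describes a simple ring: Xen advances prod as it appends struct t_rec entries to the area that follows the header, and the consumer advances cons as it drains them. A hedged sketch of the consumer side is shown below; it omits the memory barriers and wrap-around subtleties a production consumer such as xentrace needs, and nr_recs plus the mapped buffer are assumed to come from the tbuf sysctl operation:

/* Sketch: drain all currently pending records from one trace buffer. */
static void drain_trace_buffer(struct t_buf *buf, uint32_t nr_recs,
                               void (*handle)(const struct t_rec *rec))
{
    struct t_rec *recs = (struct t_rec *)(buf + 1);  /* records follow the header */
    uint32_t cons = buf->cons;

    while (cons != buf->prod) {
        handle(&recs[cons % nr_recs]);
        cons++;
    }
    buf->cons = cons;   /* publish how far we have consumed */
}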
98670 diff -ruNp linux-2.6.19/include/xen/interface/vcpu.h linux-2.6.19-xen-3.0.4/include/xen/interface/vcpu.h
98671 --- linux-2.6.19/include/xen/interface/vcpu.h   1970-01-01 00:00:00.000000000 +0000
98672 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/vcpu.h 2007-02-02 19:11:00.000000000 +0000
98673 @@ -0,0 +1,142 @@
98674 +/******************************************************************************
98675 + * vcpu.h
98676 + * 
98677 + * VCPU initialisation, query, and hotplug.
98678 + * 
98679 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98680 + * of this software and associated documentation files (the "Software"), to
98681 + * deal in the Software without restriction, including without limitation the
98682 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98683 + * sell copies of the Software, and to permit persons to whom the Software is
98684 + * furnished to do so, subject to the following conditions:
98685 + *
98686 + * The above copyright notice and this permission notice shall be included in
98687 + * all copies or substantial portions of the Software.
98688 + *
98689 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98690 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98691 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98692 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98693 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98694 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98695 + * DEALINGS IN THE SOFTWARE.
98696 + *
98697 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
98698 + */
98699 +
98700 +#ifndef __XEN_PUBLIC_VCPU_H__
98701 +#define __XEN_PUBLIC_VCPU_H__
98702 +
98703 +/*
98704 + * Prototype for this hypercall is:
98705 + *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
98706 + * @cmd        == VCPUOP_??? (VCPU operation).
98707 + * @vcpuid     == VCPU to operate on.
98708 + * @extra_args == Operation-specific extra arguments (NULL if none).
98709 + */
98710 +
98711 +/*
98712 + * Initialise a VCPU. Each VCPU can be initialised only once. A 
98713 + * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
98714 + * 
98715 + * @extra_arg == pointer to vcpu_guest_context structure containing initial
98716 + *               state for the VCPU.
98717 + */
98718 +#define VCPUOP_initialise           0
98719 +
98720 +/*
98721 + * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
98722 + * if the VCPU has not been initialised (VCPUOP_initialise).
98723 + */
98724 +#define VCPUOP_up                   1
98725 +
98726 +/*
98727 + * Bring down a VCPU (i.e., make it non-runnable).
98728 + * There are a few caveats that callers should observe:
98729 + *  1. This operation may return, and VCPU_is_up may return false, before the
98730 + *     VCPU stops running (i.e., the command is asynchronous). It is a good
98731 + *     idea to ensure that the VCPU has entered a non-critical loop before
98732 + *     bringing it down. Alternatively, this operation is guaranteed
98733 + *     synchronous if invoked by the VCPU itself.
98734 + *  2. After a VCPU is initialised, there is currently no way to drop all its
98735 + *     references to domain memory. Even a VCPU that is down still holds
98736 + *     memory references via its pagetable base pointer and GDT. It is good
98737 + *     practice to move a VCPU onto an 'idle' or default page table, LDT and
98738 + *     GDT before bringing it down.
98739 + */
98740 +#define VCPUOP_down                 2
98741 +
98742 +/* Returns 1 if the given VCPU is up. */
98743 +#define VCPUOP_is_up                3
98744 +
98745 +/*
98746 + * Return information about the state and running time of a VCPU.
98747 + * @extra_arg == pointer to vcpu_runstate_info structure.
98748 + */
98749 +#define VCPUOP_get_runstate_info    4
98750 +struct vcpu_runstate_info {
98751 +    /* VCPU's current state (RUNSTATE_*). */
98752 +    int      state;
98753 +    /* When was current state entered (system time, ns)? */
98754 +    uint64_t state_entry_time;
98755 +    /*
98756 +     * Time spent in each RUNSTATE_* (ns). The sum of these times is
98757 +     * guaranteed not to drift from system time.
98758 +     */
98759 +    uint64_t time[4];
98760 +};
98761 +typedef struct vcpu_runstate_info vcpu_runstate_info_t;
98762 +DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
98763 +
98764 +/* VCPU is currently running on a physical CPU. */
98765 +#define RUNSTATE_running  0
98766 +
98767 +/* VCPU is runnable, but not currently scheduled on any physical CPU. */
98768 +#define RUNSTATE_runnable 1
98769 +
98770 +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
98771 +#define RUNSTATE_blocked  2
98772 +
98773 +/*
98774 + * VCPU is not runnable, but it is not blocked.
98775 + * This is a 'catch all' state for things like hotplug and pauses by the
98776 + * system administrator (or for critical sections in the hypervisor).
98777 + * RUNSTATE_blocked dominates this state (it is the preferred state).
98778 + */
98779 +#define RUNSTATE_offline  3
98780 +
98781 +/*
98782 + * Register a shared memory area from which the guest may obtain its own
98783 + * runstate information without needing to execute a hypercall.
98784 + * Notes:
98785 + *  1. The registered address may be a virtual address, a physical address, or
98786 + *     a guest handle, depending on the platform. On x86 systems a virtual
98787 + *     address or a guest handle should be registered.
98788 + *  2. Only one shared area may be registered per VCPU. The shared area is
98789 + *     updated by the hypervisor each time the VCPU is scheduled. Thus
98790 + *     runstate.state will always be RUNSTATE_running and
98791 + *     runstate.state_entry_time will indicate the system time at which the
98792 + *     VCPU was last scheduled to run.
98793 + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
98794 + */
98795 +#define VCPUOP_register_runstate_memory_area 5
98796 +struct vcpu_register_runstate_memory_area {
98797 +    union {
98798 +        XEN_GUEST_HANDLE(vcpu_runstate_info_t) h;
98799 +        struct vcpu_runstate_info *v;
98800 +        uint64_t p;
98801 +    } addr;
98802 +};
98803 +typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
98804 +
98805 +#endif /* __XEN_PUBLIC_VCPU_H__ */
98806 +
98807 +/*
98808 + * Local variables:
98809 + * mode: C
98810 + * c-set-style: "BSD"
98811 + * c-basic-offset: 4
98812 + * tab-width: 4
98813 + * indent-tabs-mode: nil
98814 + * End:
98815 + */
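A hedged sketch of the query side of this interface, assuming the HYPERVISOR_vcpu_op() wrapper added elsewhere by this patch; it reports how long a VCPU has spent in each RUNSTATE_* bucket:

/* Sketch: print runstate accounting for one VCPU of the calling domain. */
static int report_vcpu_runstate(int vcpuid)
{
    struct vcpu_runstate_info info;
    int rc = HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpuid, &info);

    if (rc == 0)
        printk("vcpu%d: state=%d running=%lluns runnable=%lluns blocked=%lluns\n",
               vcpuid, info.state,
               (unsigned long long)info.time[RUNSTATE_running],
               (unsigned long long)info.time[RUNSTATE_runnable],
               (unsigned long long)info.time[RUNSTATE_blocked]);
    return rc;
}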
98816 diff -ruNp linux-2.6.19/include/xen/interface/version.h linux-2.6.19-xen-3.0.4/include/xen/interface/version.h
98817 --- linux-2.6.19/include/xen/interface/version.h        1970-01-01 00:00:00.000000000 +0000
98818 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/version.h      2007-02-02 19:11:00.000000000 +0000
98819 @@ -0,0 +1,91 @@
98820 +/******************************************************************************
98821 + * version.h
98822 + * 
98823 + * Xen version, type, and compile information.
98824 + * 
98825 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98826 + * of this software and associated documentation files (the "Software"), to
98827 + * deal in the Software without restriction, including without limitation the
98828 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98829 + * sell copies of the Software, and to permit persons to whom the Software is
98830 + * furnished to do so, subject to the following conditions:
98831 + *
98832 + * The above copyright notice and this permission notice shall be included in
98833 + * all copies or substantial portions of the Software.
98834 + *
98835 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98836 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98837 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98838 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98839 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98840 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98841 + * DEALINGS IN THE SOFTWARE.
98842 + *
98843 + * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
98844 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
98845 + */
98846 +
98847 +#ifndef __XEN_PUBLIC_VERSION_H__
98848 +#define __XEN_PUBLIC_VERSION_H__
98849 +
98850 +/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
98851 +
98852 +/* arg == NULL; returns major:minor (16:16). */
98853 +#define XENVER_version      0
98854 +
98855 +/* arg == xen_extraversion_t. */
98856 +#define XENVER_extraversion 1
98857 +typedef char xen_extraversion_t[16];
98858 +#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
98859 +
98860 +/* arg == xen_compile_info_t. */
98861 +#define XENVER_compile_info 2
98862 +struct xen_compile_info {
98863 +    char compiler[64];
98864 +    char compile_by[16];
98865 +    char compile_domain[32];
98866 +    char compile_date[32];
98867 +};
98868 +typedef struct xen_compile_info xen_compile_info_t;
98869 +
98870 +#define XENVER_capabilities 3
98871 +typedef char xen_capabilities_info_t[1024];
98872 +#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
98873 +
98874 +#define XENVER_changeset 4
98875 +typedef char xen_changeset_info_t[64];
98876 +#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
98877 +
98878 +#define XENVER_platform_parameters 5
98879 +struct xen_platform_parameters {
98880 +    unsigned long virt_start;
98881 +};
98882 +typedef struct xen_platform_parameters xen_platform_parameters_t;
98883 +
98884 +#define XENVER_get_features 6
98885 +struct xen_feature_info {
98886 +    unsigned int submap_idx;    /* IN: which 32-bit submap to return */
98887 +    uint32_t     submap;        /* OUT: 32-bit submap */
98888 +};
98889 +typedef struct xen_feature_info xen_feature_info_t;
98890 +
98891 +/* Declares the features reported by XENVER_get_features. */
98892 +#include "features.h"
98893 +
98894 +/* arg == NULL; returns host memory page size. */
98895 +#define XENVER_pagesize 7
98896 +
98897 +/* arg == xen_domain_handle_t. */
98898 +#define XENVER_guest_handle 8
98899 +
98900 +#endif /* __XEN_PUBLIC_VERSION_H__ */
98901 +
98902 +/*
98903 + * Local variables:
98904 + * mode: C
98905 + * c-set-style: "BSD"
98906 + * c-basic-offset: 4
98907 + * tab-width: 4
98908 + * indent-tabs-mode: nil
98909 + * End:
98910 + */
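For illustration, the typical boot-time use of this interface looks roughly like the sketch below, assuming the HYPERVISOR_xen_version() wrapper added elsewhere by this patch. XENVER_version returns major and minor packed into one value; the other commands fill a caller-supplied buffer:

/* Sketch: log the running hypervisor version, e.g. "Xen 3.0.4-1". */
static void log_xen_version(void)
{
    xen_extraversion_t extra;
    int version = HYPERVISOR_xen_version(XENVER_version, NULL);

    HYPERVISOR_xen_version(XENVER_extraversion, extra);
    printk("Running on Xen %d.%d%s\n", version >> 16, version & 0xffff, extra);
}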
98911 diff -ruNp linux-2.6.19/include/xen/interface/xen-compat.h linux-2.6.19-xen-3.0.4/include/xen/interface/xen-compat.h
98912 --- linux-2.6.19/include/xen/interface/xen-compat.h     1970-01-01 00:00:00.000000000 +0000
98913 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/xen-compat.h   2007-02-02 19:11:00.000000000 +0000
98914 @@ -0,0 +1,51 @@
98915 +/******************************************************************************
98916 + * xen-compat.h
98917 + * 
98918 + * Guest OS interface to Xen.  Compatibility layer.
98919 + * 
98920 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98921 + * of this software and associated documentation files (the "Software"), to
98922 + * deal in the Software without restriction, including without limitation the
98923 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98924 + * sell copies of the Software, and to permit persons to whom the Software is
98925 + * furnished to do so, subject to the following conditions:
98926 + *
98927 + * The above copyright notice and this permission notice shall be included in
98928 + * all copies or substantial portions of the Software.
98929 + *
98930 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98931 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98932 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98933 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98934 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98935 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98936 + * DEALINGS IN THE SOFTWARE.
98937 + *
98938 + * Copyright (c) 2006, Christian Limpach
98939 + */
98940 +
98941 +#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
98942 +#define __XEN_PUBLIC_XEN_COMPAT_H__
98943 +
98944 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030205
98945 +
98946 +#if defined(__XEN__) || defined(__XEN_TOOLS__)
98947 +/* Xen is built with matching headers and implements the latest interface. */
98948 +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
98949 +#elif !defined(__XEN_INTERFACE_VERSION__)
98950 +/* Guests which do not specify a version get the legacy interface. */
98951 +#define __XEN_INTERFACE_VERSION__ 0x00000000
98952 +#endif
98953 +
98954 +#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
98955 +#error "These header files do not support the requested interface version."
98956 +#endif
98957 +
98958 +/* Fields defined as a Xen guest handle since 0x00030205. */
98959 +#if __XEN_INTERFACE_VERSION__ >= 0x00030205
98960 +#define XEN_GUEST_HANDLE_00030205(type) XEN_GUEST_HANDLE(type)
98961 +#else
98962 +#define XEN_GUEST_HANDLE_00030205(type) type *
98963 +#endif
98964 +
98965 +#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
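In practice a guest selects its interface version before including any other public header; everything else then keys off __XEN_INTERFACE_VERSION__, as the guest-handle macro above does. A minimal illustration (hypothetical guest code, not part of the patch):

/* Sketch: a guest source file opting in to the 3.0.2-era interface. */
#define __XEN_INTERFACE_VERSION__ 0x00030202
#include <xen/interface/xen.h>
/*
 * With this setting XEN_GUEST_HANDLE_00030205(void) expands to 'void *',
 * and the sched_op/event_channel_op compatibility aliasing in xen.h is
 * not applied, because 0x00030202 is at or above both thresholds.
 */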
98966 diff -ruNp linux-2.6.19/include/xen/interface/xen.h linux-2.6.19-xen-3.0.4/include/xen/interface/xen.h
98967 --- linux-2.6.19/include/xen/interface/xen.h    1970-01-01 00:00:00.000000000 +0000
98968 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/xen.h  2007-02-02 19:11:00.000000000 +0000
98969 @@ -0,0 +1,597 @@
98970 +/******************************************************************************
98971 + * xen.h
98972 + * 
98973 + * Guest OS interface to Xen.
98974 + * 
98975 + * Permission is hereby granted, free of charge, to any person obtaining a copy
98976 + * of this software and associated documentation files (the "Software"), to
98977 + * deal in the Software without restriction, including without limitation the
98978 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
98979 + * sell copies of the Software, and to permit persons to whom the Software is
98980 + * furnished to do so, subject to the following conditions:
98981 + *
98982 + * The above copyright notice and this permission notice shall be included in
98983 + * all copies or substantial portions of the Software.
98984 + *
98985 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
98986 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
98987 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
98988 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
98989 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
98990 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
98991 + * DEALINGS IN THE SOFTWARE.
98992 + *
98993 + * Copyright (c) 2004, K A Fraser
98994 + */
98995 +
98996 +#ifndef __XEN_PUBLIC_XEN_H__
98997 +#define __XEN_PUBLIC_XEN_H__
98998 +
98999 +#include "xen-compat.h"
99000 +
99001 +#if defined(__i386__) || defined(__x86_64__)
99002 +#include "arch-x86/xen.h"
99003 +#elif defined(__ia64__)
99004 +#include "arch-ia64.h"
99005 +#elif defined(__powerpc__)
99006 +#include "arch-powerpc.h"
99007 +#else
99008 +#error "Unsupported architecture"
99009 +#endif
99010 +
99011 +/*
99012 + * HYPERCALLS
99013 + */
99014 +
99015 +#define __HYPERVISOR_set_trap_table        0
99016 +#define __HYPERVISOR_mmu_update            1
99017 +#define __HYPERVISOR_set_gdt               2
99018 +#define __HYPERVISOR_stack_switch          3
99019 +#define __HYPERVISOR_set_callbacks         4
99020 +#define __HYPERVISOR_fpu_taskswitch        5
99021 +#define __HYPERVISOR_sched_op_compat       6 /* compat since 0x00030101 */
99022 +#define __HYPERVISOR_platform_op           7
99023 +#define __HYPERVISOR_set_debugreg          8
99024 +#define __HYPERVISOR_get_debugreg          9
99025 +#define __HYPERVISOR_update_descriptor    10
99026 +#define __HYPERVISOR_memory_op            12
99027 +#define __HYPERVISOR_multicall            13
99028 +#define __HYPERVISOR_update_va_mapping    14
99029 +#define __HYPERVISOR_set_timer_op         15
99030 +#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
99031 +#define __HYPERVISOR_xen_version          17
99032 +#define __HYPERVISOR_console_io           18
99033 +#define __HYPERVISOR_physdev_op_compat    19 /* compat since 0x00030202 */
99034 +#define __HYPERVISOR_grant_table_op       20
99035 +#define __HYPERVISOR_vm_assist            21
99036 +#define __HYPERVISOR_update_va_mapping_otherdomain 22
99037 +#define __HYPERVISOR_iret                 23 /* x86 only */
99038 +#define __HYPERVISOR_vcpu_op              24
99039 +#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
99040 +#define __HYPERVISOR_mmuext_op            26
99041 +#define __HYPERVISOR_acm_op               27
99042 +#define __HYPERVISOR_nmi_op               28
99043 +#define __HYPERVISOR_sched_op             29
99044 +#define __HYPERVISOR_callback_op          30
99045 +#define __HYPERVISOR_xenoprof_op          31
99046 +#define __HYPERVISOR_event_channel_op     32
99047 +#define __HYPERVISOR_physdev_op           33
99048 +#define __HYPERVISOR_hvm_op               34
99049 +#define __HYPERVISOR_sysctl               35
99050 +#define __HYPERVISOR_domctl               36
99051 +#define __HYPERVISOR_kexec_op             37
99052 +
99053 +/* Architecture-specific hypercall definitions. */
99054 +#define __HYPERVISOR_arch_0               48
99055 +#define __HYPERVISOR_arch_1               49
99056 +#define __HYPERVISOR_arch_2               50
99057 +#define __HYPERVISOR_arch_3               51
99058 +#define __HYPERVISOR_arch_4               52
99059 +#define __HYPERVISOR_arch_5               53
99060 +#define __HYPERVISOR_arch_6               54
99061 +#define __HYPERVISOR_arch_7               55
99062 +
99063 +/*
99064 + * HYPERCALL COMPATIBILITY.
99065 + */
99066 +
99067 +/* New sched_op hypercall introduced in 0x00030101. */
99068 +#if __XEN_INTERFACE_VERSION__ < 0x00030101
99069 +#undef __HYPERVISOR_sched_op
99070 +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
99071 +#endif
99072 +
99073 +/* New event-channel and physdev hypercalls introduced in 0x00030202. */
99074 +#if __XEN_INTERFACE_VERSION__ < 0x00030202
99075 +#undef __HYPERVISOR_event_channel_op
99076 +#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
99077 +#undef __HYPERVISOR_physdev_op
99078 +#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
99079 +#endif
99080 +
99081 +/* New platform_op hypercall introduced in 0x00030204. */
99082 +#if __XEN_INTERFACE_VERSION__ < 0x00030204
99083 +#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op
99084 +#endif
99085 +
99086 +/* 
99087 + * VIRTUAL INTERRUPTS
99088 + * 
99089 + * Virtual interrupts that a guest OS may receive from Xen.
99090 + * 
99091 + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
99092 + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
99093 + * The latter can be allocated only once per guest: they must initially be
99094 + * allocated to VCPU0 but can subsequently be re-bound.
99095 + */
99096 +#define VIRQ_TIMER      0  /* V. Timebase update, and/or requested timeout.  */
99097 +#define VIRQ_DEBUG      1  /* V. Request guest to dump debug info.           */
99098 +#define VIRQ_CONSOLE    2  /* G. (DOM0) Bytes received on emergency console. */
99099 +#define VIRQ_DOM_EXC    3  /* G. (DOM0) Exceptional event for some domain.   */
99100 +#define VIRQ_TBUF       4  /* G. (DOM0) Trace buffer has records available.  */
99101 +#define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
99102 +#define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
99103 +
99104 +/* Architecture-specific VIRQ definitions. */
99105 +#define VIRQ_ARCH_0    16
99106 +#define VIRQ_ARCH_1    17
99107 +#define VIRQ_ARCH_2    18
99108 +#define VIRQ_ARCH_3    19
99109 +#define VIRQ_ARCH_4    20
99110 +#define VIRQ_ARCH_5    21
99111 +#define VIRQ_ARCH_6    22
99112 +#define VIRQ_ARCH_7    23
99113 +
99114 +#define NR_VIRQS       24
99115 +
99116 +/*
99117 + * MMU-UPDATE REQUESTS
99118 + * 
99119 + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
99120 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
99121 + * Where the FD has some effect, it is described below.
99122 + * ptr[1:0] specifies the appropriate MMU_* command.
99123 + * 
99124 + * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
99125 + * Updates an entry in a page table. If updating an L1 table, and the new
99126 + * table entry is valid/present, the mapped frame must belong to the FD, if
99127 + * an FD has been specified. If attempting to map an I/O page then the
99128 + * caller assumes the privilege of the FD.
99129 + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
99130 + * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
99131 + * ptr[:2]  -- Machine address of the page-table entry to modify.
99132 + * val      -- Value to write.
99133 + * 
99134 + * ptr[1:0] == MMU_MACHPHYS_UPDATE:
99135 + * Updates an entry in the machine->pseudo-physical mapping table.
99136 + * ptr[:2]  -- Machine address within the frame whose mapping to modify.
99137 + *             The frame must belong to the FD, if one is specified.
99138 + * val      -- Value to write into the mapping entry.
99139 + */
99140 +#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
99141 +#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
99142 +
99143 +/*
99144 + * MMU EXTENDED OPERATIONS
99145 + * 
99146 + * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
99147 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
99148 + * Where the FD has some effect, it is described below.
99149 + * 
99150 + * cmd: MMUEXT_(UN)PIN_*_TABLE
99151 + * mfn: Machine frame number to be (un)pinned as a p.t. page.
99152 + *      The frame must belong to the FD, if one is specified.
99153 + * 
99154 + * cmd: MMUEXT_NEW_BASEPTR
99155 + * mfn: Machine frame number of new page-table base to install in MMU.
99156 + * 
99157 + * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
99158 + * mfn: Machine frame number of new page-table base to install in MMU
99159 + *      when in user space.
99160 + * 
99161 + * cmd: MMUEXT_TLB_FLUSH_LOCAL
99162 + * No additional arguments. Flushes local TLB.
99163 + * 
99164 + * cmd: MMUEXT_INVLPG_LOCAL
99165 + * linear_addr: Linear address to be flushed from the local TLB.
99166 + * 
99167 + * cmd: MMUEXT_TLB_FLUSH_MULTI
99168 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
99169 + * 
99170 + * cmd: MMUEXT_INVLPG_MULTI
99171 + * linear_addr: Linear address to be flushed.
99172 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
99173 + * 
99174 + * cmd: MMUEXT_TLB_FLUSH_ALL
99175 + * No additional arguments. Flushes all VCPUs' TLBs.
99176 + * 
99177 + * cmd: MMUEXT_INVLPG_ALL
99178 + * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
99179 + * 
99180 + * cmd: MMUEXT_FLUSH_CACHE
99181 + * No additional arguments. Writes back and flushes cache contents.
99182 + * 
99183 + * cmd: MMUEXT_SET_LDT
99184 + * linear_addr: Linear address of LDT base (NB. must be page-aligned).
99185 + * nr_ents: Number of entries in LDT.
99186 + */
99187 +#define MMUEXT_PIN_L1_TABLE      0
99188 +#define MMUEXT_PIN_L2_TABLE      1
99189 +#define MMUEXT_PIN_L3_TABLE      2
99190 +#define MMUEXT_PIN_L4_TABLE      3
99191 +#define MMUEXT_UNPIN_TABLE       4
99192 +#define MMUEXT_NEW_BASEPTR       5
99193 +#define MMUEXT_TLB_FLUSH_LOCAL   6
99194 +#define MMUEXT_INVLPG_LOCAL      7
99195 +#define MMUEXT_TLB_FLUSH_MULTI   8
99196 +#define MMUEXT_INVLPG_MULTI      9
99197 +#define MMUEXT_TLB_FLUSH_ALL    10
99198 +#define MMUEXT_INVLPG_ALL       11
99199 +#define MMUEXT_FLUSH_CACHE      12
99200 +#define MMUEXT_SET_LDT          13
99201 +#define MMUEXT_NEW_USER_BASEPTR 15
99202 +
99203 +#ifndef __ASSEMBLY__
99204 +struct mmuext_op {
99205 +    unsigned int cmd;
99206 +    union {
99207 +        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
99208 +        xen_pfn_t     mfn;
99209 +        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
99210 +        unsigned long linear_addr;
99211 +    } arg1;
99212 +    union {
99213 +        /* SET_LDT */
99214 +        unsigned int nr_ents;
99215 +        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
99216 +        XEN_GUEST_HANDLE_00030205(void) vcpumask;
99217 +    } arg2;
99218 +};
99219 +typedef struct mmuext_op mmuext_op_t;
99220 +DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
99221 +#endif
99222 +
99223 +/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
99224 +/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
99225 +/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
99226 +#define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
99227 +#define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
99228 +#define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
99229 +#define UVMF_FLUSHTYPE_MASK     (3UL<<0)
99230 +#define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
99231 +#define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
99232 +#define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
99233 +
99234 +/*
99235 + * Commands to HYPERVISOR_console_io().
99236 + */
99237 +#define CONSOLEIO_write         0
99238 +#define CONSOLEIO_read          1
99239 +
99240 +/*
99241 + * Commands to HYPERVISOR_vm_assist().
99242 + */
99243 +#define VMASST_CMD_enable                0
99244 +#define VMASST_CMD_disable               1
99245 +
99246 +/* x86/32 guests: simulate full 4GB segment limits. */
99247 +#define VMASST_TYPE_4gb_segments         0
99248 +
99249 +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
99250 +#define VMASST_TYPE_4gb_segments_notify  1
99251 +
99252 +/*
99253 + * x86 guests: support writes to bottom-level PTEs.
99254 + * NB1. Page-directory entries cannot be written.
99255 + * NB2. Guest must continue to remove all writable mappings of PTEs.
99256 + */
99257 +#define VMASST_TYPE_writable_pagetables  2
99258 +
99259 +/* x86/PAE guests: support PDPTs above 4GB. */
99260 +#define VMASST_TYPE_pae_extended_cr3     3
99261 +
99262 +#define MAX_VMASST_TYPE                  3
99263 +
99264 +#ifndef __ASSEMBLY__
99265 +
99266 +typedef uint16_t domid_t;
99267 +
99268 +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
99269 +#define DOMID_FIRST_RESERVED (0x7FF0U)
99270 +
99271 +/* DOMID_SELF is used in certain contexts to refer to oneself. */
99272 +#define DOMID_SELF (0x7FF0U)
99273 +
99274 +/*
99275 + * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
99276 + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
99277 + * is useful to ensure that no mappings to the OS's own heap are accidentally
99278 + * installed. (e.g., in Linux this could cause havoc as reference counts
99279 + * aren't adjusted on the I/O-mapping code path).
99280 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
99281 + * be specified by any calling domain.
99282 + */
99283 +#define DOMID_IO   (0x7FF1U)
99284 +
99285 +/*
99286 + * DOMID_XEN is used to allow privileged domains to map restricted parts of
99287 + * Xen's heap space (e.g., the machine_to_phys table).
99288 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
99289 + * the caller is privileged.
99290 + */
99291 +#define DOMID_XEN  (0x7FF2U)
99292 +
99293 +/*
99294 + * Send an array of these to HYPERVISOR_mmu_update().
99295 + * NB. The fields are natural pointer/address size for this architecture.
99296 + */
99297 +struct mmu_update {
99298 +    uint64_t ptr;       /* Machine address of PTE. */
99299 +    uint64_t val;       /* New contents of PTE.    */
99300 +};
99301 +typedef struct mmu_update mmu_update_t;
99302 +DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
99303 +
99304 +/*
99305 + * Send an array of these to HYPERVISOR_multicall().
99306 + * NB. The fields are natural register size for this architecture.
99307 + */
99308 +struct multicall_entry {
99309 +    unsigned long op, result;
99310 +    unsigned long args[6];
99311 +};
99312 +typedef struct multicall_entry multicall_entry_t;
99313 +DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
99314 +
99315 +/*
99316 + * Event channel endpoints per domain:
99317 + *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
99318 + */
99319 +#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
99320 +
99321 +struct vcpu_time_info {
99322 +    /*
99323 +     * Updates to the following values are preceded and followed by an
99324 +     * increment of 'version'. The guest can therefore detect updates by
99325 +     * looking for changes to 'version'. If the least-significant bit of
99326 +     * the version number is set then an update is in progress and the guest
99327 +     * must wait to read a consistent set of values.
99328 +     * The correct way to interact with the version number is similar to
99329 +     * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
99330 +     */
99331 +    uint32_t version;
99332 +    uint32_t pad0;
99333 +    uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
99334 +    uint64_t system_time;     /* Time, in nanosecs, since boot.    */
99335 +    /*
99336 +     * Current system time:
99337 +     *   system_time +
99338 +     *   ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
99339 +     * CPU frequency (Hz):
99340 +     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
99341 +     */
99342 +    uint32_t tsc_to_system_mul;
99343 +    int8_t   tsc_shift;
99344 +    int8_t   pad1[3];
99345 +}; /* 32 bytes */
99346 +typedef struct vcpu_time_info vcpu_time_info_t;
99347 +
99348 +struct vcpu_info {
99349 +    /*
99350 +     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
99351 +     * a pending notification for a particular VCPU. It is then cleared 
99352 +     * by the guest OS /before/ checking for pending work, thus avoiding
99353 +     * a set-and-check race. Note that the mask is only accessed by Xen
99354 +     * on the CPU that is currently hosting the VCPU. This means that the
99355 +     * pending and mask flags can be updated by the guest without special
99356 +     * synchronisation (i.e., no need for the x86 LOCK prefix).
99357 +     * This may seem suboptimal because if the pending flag is set by
99358 +     * a different CPU then an IPI may be scheduled even when the mask
99359 +     * is set. However, note:
99360 +     *  1. The task of 'interrupt holdoff' is covered by the per-event-
99361 +     *     channel mask bits. A 'noisy' event that is continually being
99362 +     *     triggered can be masked at source at this very precise
99363 +     *     granularity.
99364 +     *  2. The main purpose of the per-VCPU mask is therefore to restrict
99365 +     *     reentrant execution: whether for concurrency control, or to
99366 +     *     prevent unbounded stack usage. Whatever the purpose, we expect
99367 +     *     that the mask will be asserted only for short periods at a time,
99368 +     *     and so the likelihood of a 'spurious' IPI is suitably small.
99369 +     * The mask is read before making an event upcall to the guest: a
99370 +     * non-zero mask therefore guarantees that the VCPU will not receive
99371 +     * an upcall activation. The mask is cleared when the VCPU requests
99372 +     * to block: this avoids wakeup-waiting races.
99373 +     */
99374 +    uint8_t evtchn_upcall_pending;
99375 +    uint8_t evtchn_upcall_mask;
99376 +    unsigned long evtchn_pending_sel;
99377 +    struct arch_vcpu_info arch;
99378 +    struct vcpu_time_info time;
99379 +}; /* 64 bytes (x86) */
99380 +typedef struct vcpu_info vcpu_info_t;
99381 +
99382 +/*
99383 + * Xen/kernel shared data -- pointer provided in start_info.
99384 + *
99385 + * This structure is defined to be both smaller than a page, and the
99386 + * only data on the shared page, but may vary in actual size even within
99387 + * compatible Xen versions; guests should not rely on the size
99388 + * of this structure remaining constant.
99389 + */
99390 +struct shared_info {
99391 +    struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
99392 +
99393 +    /*
99394 +     * A domain can create "event channels" on which it can send and receive
99395 +     * asynchronous event notifications. There are three classes of event that
99396 +     * are delivered by this mechanism:
99397 +     *  1. Bi-directional inter- and intra-domain connections. Domains must
99398 +     *     arrange out-of-band to set up a connection (usually by allocating
99399 +     *     an unbound 'listener' port and advertising that via a storage service
99400 +     *     such as xenstore).
99401 +     *  2. Physical interrupts. A domain with suitable hardware-access
99402 +     *     privileges can bind an event-channel port to a physical interrupt
99403 +     *     source.
99404 +     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
99405 +     *     port to a virtual interrupt source, such as the virtual-timer
99406 +     *     device or the emergency console.
99407 +     * 
99408 +     * Event channels are addressed by a "port index". Each channel is
99409 +     * associated with two bits of information:
99410 +     *  1. PENDING -- notifies the domain that there is a pending notification
99411 +     *     to be processed. This bit is cleared by the guest.
99412 +     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
99413 +     *     will cause an asynchronous upcall to be scheduled. This bit is only
99414 +     *     updated by the guest. It is read-only within Xen. If a channel
99415 +     *     becomes pending while the channel is masked then the 'edge' is lost
99416 +     *     (i.e., when the channel is unmasked, the guest must manually handle
99417 +     *     pending notifications as no upcall will be scheduled by Xen).
99418 +     * 
99419 +     * To expedite scanning of pending notifications, any 0->1 pending
99420 +     * transition on an unmasked channel causes a corresponding bit in a
99421 +     * per-vcpu selector word to be set. Each bit in the selector covers a
99422 +     * 'C long' in the PENDING bitfield array.
99423 +     */
99424 +    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
99425 +    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
99426 +
99427 +    /*
99428 +     * Wallclock time: updated only by control software. Guests should base
99429 +     * their gettimeofday() syscall on this wallclock-base value.
99430 +     */
99431 +    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
99432 +    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
99433 +    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
99434 +
99435 +    struct arch_shared_info arch;
99436 +
99437 +};
99438 +typedef struct shared_info shared_info_t;
99439 +
99440 +/*
99441 + * Start-of-day memory layout for the initial domain (DOM0):
99442 + *  1. The domain is started within contiguous virtual-memory region.
99443 + *  2. The contiguous region begins and ends on an aligned 4MB boundary.
99444 + *  3. The region start corresponds to the load address of the OS image.
99445 + *     If the load address is not 4MB aligned then the address is rounded down.
99446 + *  4. This is the order of bootstrap elements in the initial virtual region:
99447 + *      a. relocated kernel image
99448 + *      b. initial ram disk              [mod_start, mod_len]
99449 + *      c. list of allocated page frames [mfn_list, nr_pages]
99450 + *      d. start_info_t structure        [register ESI (x86)]
99451 + *      e. bootstrap page tables         [pt_base, CR3 (x86)]
99452 + *      f. bootstrap stack               [register ESP (x86)]
99453 + *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
99454 + *  6. The initial ram disk may be omitted.
99455 + *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
99456 + *     layout for the domain. In particular, the bootstrap virtual-memory
99457 + *     region is a 1:1 mapping to the first section of the pseudo-physical map.
99458 + *  8. All bootstrap elements are mapped read-writable for the guest OS. The
99459 + *     only exception is the bootstrap page table, which is mapped read-only.
99460 + *  9. There is guaranteed to be at least 512kB padding after the final
99461 + *     bootstrap element. If necessary, the bootstrap virtual region is
99462 + *     extended by an extra 4MB to ensure this.
99463 + */
99464 +
99465 +#define MAX_GUEST_CMDLINE 1024
99466 +struct start_info {
99467 +    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
99468 +    char magic[32];             /* "xen-<version>-<platform>".            */
99469 +    unsigned long nr_pages;     /* Total pages allocated to this domain.  */
99470 +    unsigned long shared_info;  /* MACHINE address of shared info struct. */
99471 +    uint32_t flags;             /* SIF_xxx flags.                         */
99472 +    xen_pfn_t store_mfn;        /* MACHINE page number of shared page.    */
99473 +    uint32_t store_evtchn;      /* Event channel for store communication. */
99474 +    union {
99475 +        struct {
99476 +            xen_pfn_t mfn;      /* MACHINE page number of console page.   */
99477 +            uint32_t  evtchn;   /* Event channel for console page.        */
99478 +        } domU;
99479 +        struct {
99480 +            uint32_t info_off;  /* Offset of console_info struct.         */
99481 +            uint32_t info_size; /* Size of console_info struct from start.*/
99482 +        } dom0;
99483 +    } console;
99484 +    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
99485 +    unsigned long pt_base;      /* VIRTUAL address of page directory.     */
99486 +    unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
99487 +    unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
99488 +    unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
99489 +    unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
99490 +    int8_t cmd_line[MAX_GUEST_CMDLINE];
99491 +};
99492 +typedef struct start_info start_info_t;
99493 +
99494 +/* New console union for dom0 introduced in 0x00030203. */
99495 +#if __XEN_INTERFACE_VERSION__ < 0x00030203
99496 +#define console_mfn    console.domU.mfn
99497 +#define console_evtchn console.domU.evtchn
99498 +#endif
99499 +
99500 +/* These flags are passed in the 'flags' field of start_info_t. */
99501 +#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
99502 +#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
99503 +
99504 +typedef struct dom0_vga_console_info {
99505 +    uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
99506 +#define XEN_VGATYPE_TEXT_MODE_3 0x03
99507 +#define XEN_VGATYPE_VESA_LFB    0x23
99508 +
99509 +    union {
99510 +        struct {
99511 +            /* Font height, in pixels. */
99512 +            uint16_t font_height;
99513 +            /* Cursor location (column, row). */
99514 +            uint16_t cursor_x, cursor_y;
99515 +            /* Number of rows and columns (dimensions in characters). */
99516 +            uint16_t rows, columns;
99517 +        } text_mode_3;
99518 +
99519 +        struct {
99520 +            /* Width and height, in pixels. */
99521 +            uint16_t width, height;
99522 +            /* Bytes per scan line. */
99523 +            uint16_t bytes_per_line;
99524 +            /* Bits per pixel. */
99525 +            uint16_t bits_per_pixel;
99526 +            /* LFB physical address, and size (in units of 64kB). */
99527 +            uint32_t lfb_base;
99528 +            uint32_t lfb_size;
99529 +            /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
99530 +            uint8_t  red_pos, red_size;
99531 +            uint8_t  green_pos, green_size;
99532 +            uint8_t  blue_pos, blue_size;
99533 +            uint8_t  rsvd_pos, rsvd_size;
99534 +        } vesa_lfb;
99535 +    } u;
99536 +} dom0_vga_console_info_t;
99537 +
99538 +typedef uint8_t xen_domain_handle_t[16];
99539 +
99540 +/* Turn a plain number into a C unsigned long constant. */
99541 +#define __mk_unsigned_long(x) x ## UL
99542 +#define mk_unsigned_long(x) __mk_unsigned_long(x)
99543 +
99544 +DEFINE_XEN_GUEST_HANDLE(uint8_t);
99545 +DEFINE_XEN_GUEST_HANDLE(uint16_t);
99546 +DEFINE_XEN_GUEST_HANDLE(uint32_t);
99547 +DEFINE_XEN_GUEST_HANDLE(uint64_t);
99548 +
99549 +#else /* __ASSEMBLY__ */
99550 +
99551 +/* In assembly code we cannot use C numeric constant suffixes. */
99552 +#define mk_unsigned_long(x) x
99553 +
99554 +#endif /* !__ASSEMBLY__ */
99555 +
99556 +#endif /* __XEN_PUBLIC_XEN_H__ */
99557 +
99558 +/*
99559 + * Local variables:
99560 + * mode: C
99561 + * c-set-style: "BSD"
99562 + * c-basic-offset: 4
99563 + * tab-width: 4
99564 + * indent-tabs-mode: nil
99565 + * End:
99566 + */
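
For illustration only (not part of the patch), a minimal sketch of code that locates the domU console page both before and after interface version 0x00030203, leaning on the compatibility defines above:

static xen_pfn_t console_page_mfn(const start_info_t *si)
{
#if __XEN_INTERFACE_VERSION__ < 0x00030203
        return si->console_mfn;        /* alias expands to si->console.domU.mfn */
#else
        return si->console.domU.mfn;
#endif
}
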
99567 diff -ruNp linux-2.6.19/include/xen/interface/xencomm.h linux-2.6.19-xen-3.0.4/include/xen/interface/xencomm.h
99568 --- linux-2.6.19/include/xen/interface/xencomm.h        1970-01-01 00:00:00.000000000 +0000
99569 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/xencomm.h      2007-02-02 19:11:00.000000000 +0000
99570 @@ -0,0 +1,41 @@
99571 +/*
99572 + * Permission is hereby granted, free of charge, to any person obtaining a copy
99573 + * of this software and associated documentation files (the "Software"), to
99574 + * deal in the Software without restriction, including without limitation the
99575 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
99576 + * sell copies of the Software, and to permit persons to whom the Software is
99577 + * furnished to do so, subject to the following conditions:
99578 + *
99579 + * The above copyright notice and this permission notice shall be included in
99580 + * all copies or substantial portions of the Software.
99581 + *
99582 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99583 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
99584 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
99585 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
99586 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
99587 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
99588 + * DEALINGS IN THE SOFTWARE.
99589 + *
99590 + * Copyright (C) IBM Corp. 2006
99591 + */
99592 +
99593 +#ifndef _XEN_XENCOMM_H_
99594 +#define _XEN_XENCOMM_H_
99595 +
99596 +/* A xencomm descriptor is a scatter/gather list containing physical
99597 + * addresses corresponding to a virtually contiguous memory area. The
99598 + * hypervisor translates these physical addresses to machine addresses to copy
99599 + * to and from the virtually contiguous area.
99600 + */
99601 +
99602 +#define XENCOMM_MAGIC 0x58434F4D /* 'XCOM' */
99603 +#define XENCOMM_INVALID (~0UL)
99604 +
99605 +struct xencomm_desc {
99606 +    uint32_t magic;
99607 +    uint32_t nr_addrs; /* the number of entries in address[] */
99608 +    uint64_t address[0];
99609 +};
99610 +
99611 +#endif /* _XEN_XENCOMM_H_ */
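
As a rough sketch (not part of the patch) of how such a descriptor might be populated for a virtually contiguous buffer; virt_to_phys(), PAGE_SIZE and PAGE_MASK are assumed from the architecture headers:

static int xencomm_fill(struct xencomm_desc *desc, unsigned int max_addrs,
                        void *buf, unsigned long len)
{
        unsigned long vaddr = (unsigned long)buf;
        unsigned long end = vaddr + len;

        desc->magic = XENCOMM_MAGIC;
        desc->nr_addrs = 0;

        while (vaddr < end) {
                if (desc->nr_addrs >= max_addrs)
                        return -ENOSPC;         /* descriptor too small */
                /* One physical address per page covered by the buffer. */
                desc->address[desc->nr_addrs++] = virt_to_phys((void *)vaddr);
                vaddr = (vaddr & PAGE_MASK) + PAGE_SIZE;
        }
        return 0;
}
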
99612 diff -ruNp linux-2.6.19/include/xen/interface/xenoprof.h linux-2.6.19-xen-3.0.4/include/xen/interface/xenoprof.h
99613 --- linux-2.6.19/include/xen/interface/xenoprof.h       1970-01-01 00:00:00.000000000 +0000
99614 +++ linux-2.6.19-xen-3.0.4/include/xen/interface/xenoprof.h     2007-02-02 19:11:00.000000000 +0000
99615 @@ -0,0 +1,130 @@
99616 +/******************************************************************************
99617 + * xenoprof.h
99618 + * 
99619 + * Interface for enabling system wide profiling based on hardware performance
99620 + * counters
99621 + * 
99622 + * Permission is hereby granted, free of charge, to any person obtaining a copy
99623 + * of this software and associated documentation files (the "Software"), to
99624 + * deal in the Software without restriction, including without limitation the
99625 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
99626 + * sell copies of the Software, and to permit persons to whom the Software is
99627 + * furnished to do so, subject to the following conditions:
99628 + *
99629 + * The above copyright notice and this permission notice shall be included in
99630 + * all copies or substantial portions of the Software.
99631 + *
99632 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99633 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
99634 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
99635 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
99636 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
99637 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
99638 + * DEALINGS IN THE SOFTWARE.
99639 + *
99640 + * Copyright (C) 2005 Hewlett-Packard Co.
99641 + * Written by Aravind Menon & Jose Renato Santos
99642 + */
99643 +
99644 +#ifndef __XEN_PUBLIC_XENOPROF_H__
99645 +#define __XEN_PUBLIC_XENOPROF_H__
99646 +
99647 +#include "xen.h"
99648 +
99649 +/*
99650 + * Commands to HYPERVISOR_xenoprof_op().
99651 + */
99652 +#define XENOPROF_init                0
99653 +#define XENOPROF_reset_active_list   1
99654 +#define XENOPROF_reset_passive_list  2
99655 +#define XENOPROF_set_active          3
99656 +#define XENOPROF_set_passive         4
99657 +#define XENOPROF_reserve_counters    5
99658 +#define XENOPROF_counter             6
99659 +#define XENOPROF_setup_events        7
99660 +#define XENOPROF_enable_virq         8
99661 +#define XENOPROF_start               9
99662 +#define XENOPROF_stop               10
99663 +#define XENOPROF_disable_virq       11
99664 +#define XENOPROF_release_counters   12
99665 +#define XENOPROF_shutdown           13
99666 +#define XENOPROF_get_buffer         14
99667 +#define XENOPROF_last_op            14
99668 +
99669 +#define MAX_OPROF_EVENTS    32
99670 +#define MAX_OPROF_DOMAINS   25
99671 +#define XENOPROF_CPU_TYPE_SIZE 64
99672 +
99673 +/* Xenoprof performance events (not Xen events) */
99674 +struct event_log {
99675 +    uint64_t eip;
99676 +    uint8_t mode;
99677 +    uint8_t event;
99678 +};
99679 +
99680 +/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
99681 +struct xenoprof_buf {
99682 +    uint32_t event_head;
99683 +    uint32_t event_tail;
99684 +    uint32_t event_size;
99685 +    uint32_t vcpu_id;
99686 +    uint64_t xen_samples;
99687 +    uint64_t kernel_samples;
99688 +    uint64_t user_samples;
99689 +    uint64_t lost_samples;
99690 +    struct event_log event_log[1];
99691 +};
99692 +typedef struct xenoprof_buf xenoprof_buf_t;
99693 +DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
99694 +
99695 +struct xenoprof_init {
99696 +    int32_t  num_events;
99697 +    int32_t  is_primary;
99698 +    char cpu_type[XENOPROF_CPU_TYPE_SIZE];
99699 +};
99700 +typedef struct xenoprof_init xenoprof_init_t;
99701 +DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
99702 +
99703 +struct xenoprof_get_buffer {
99704 +    int32_t  max_samples;
99705 +    int32_t  nbuf;
99706 +    int32_t  bufsize;
99707 +    uint64_t buf_gmaddr;
99708 +};
99709 +typedef struct xenoprof_get_buffer xenoprof_get_buffer_t;
99710 +DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t);
99711 +
99712 +struct xenoprof_counter {
99713 +    uint32_t ind;
99714 +    uint64_t count;
99715 +    uint32_t enabled;
99716 +    uint32_t event;
99717 +    uint32_t hypervisor;
99718 +    uint32_t kernel;
99719 +    uint32_t user;
99720 +    uint64_t unit_mask;
99721 +};
99722 +typedef struct xenoprof_counter xenoprof_counter_t;
99723 +DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
99724 +
99725 +typedef struct xenoprof_passive {
99726 +    uint16_t domain_id;
99727 +    int32_t  max_samples;
99728 +    int32_t  nbuf;
99729 +    int32_t  bufsize;
99730 +    uint64_t buf_gmaddr;
99731 +} xenoprof_passive_t;
99732 +DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
99733 +
99734 +
99735 +#endif /* __XEN_PUBLIC_XENOPROF_H__ */
99736 +
99737 +/*
99738 + * Local variables:
99739 + * mode: C
99740 + * c-set-style: "BSD"
99741 + * c-basic-offset: 4
99742 + * tab-width: 4
99743 + * indent-tabs-mode: nil
99744 + * End:
99745 + */
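
A sketch of how a consumer might drain one VCPU's sample buffer as laid out above: event_tail chases event_head modulo event_size. handle_sample() is a placeholder, and a real consumer would also need memory barriers; this is illustrative only.

static void xenoprof_drain_buf(struct xenoprof_buf *buf,
                               void (*handle_sample)(const struct event_log *))
{
        uint32_t tail = buf->event_tail;

        while (tail != buf->event_head) {
                handle_sample(&buf->event_log[tail]);
                if (++tail == buf->event_size)
                        tail = 0;
        }
        buf->event_tail = tail;        /* publish the consumed position */
}
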
99746 diff -ruNp linux-2.6.19/include/xen/pcifront.h linux-2.6.19-xen-3.0.4/include/xen/pcifront.h
99747 --- linux-2.6.19/include/xen/pcifront.h 1970-01-01 00:00:00.000000000 +0000
99748 +++ linux-2.6.19-xen-3.0.4/include/xen/pcifront.h       2007-02-02 19:11:00.000000000 +0000
99749 @@ -0,0 +1,76 @@
99750 +/*
99751 + * PCI Frontend - arch-dependent declarations
99752 + *
99753 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
99754 + */
99755 +#ifndef __XEN_ASM_PCIFRONT_H__
99756 +#define __XEN_ASM_PCIFRONT_H__
99757 +
99758 +#include <linux/spinlock.h>
99759 +
99760 +#ifdef __KERNEL__
99761 +
99762 +#ifndef __ia64__
99763 +
99764 +struct pcifront_device;
99765 +struct pci_bus;
99766 +
99767 +struct pcifront_sd {
99768 +       int domain;
99769 +       struct pcifront_device *pdev;
99770 +};
99771 +
99772 +static inline struct pcifront_device *
99773 +pcifront_get_pdev(struct pcifront_sd *sd)
99774 +{
99775 +       return sd->pdev;
99776 +}
99777 +
99778 +static inline void pcifront_init_sd(struct pcifront_sd *sd, int domain,
99779 +                                   struct pcifront_device *pdev)
99780 +{
99781 +       sd->domain = domain;
99782 +       sd->pdev = pdev;
99783 +}
99784 +
99785 +#if defined(CONFIG_PCI_DOMAINS)
99786 +static inline int pci_domain_nr(struct pci_bus *bus)
99787 +{
99788 +       struct pcifront_sd *sd = bus->sysdata;
99789 +       return sd->domain;
99790 +}
99791 +static inline int pci_proc_domain(struct pci_bus *bus)
99792 +{
99793 +       return pci_domain_nr(bus);
99794 +}
99795 +#endif /* CONFIG_PCI_DOMAINS */
99796 +
99797 +#else /* __ia64__ */
99798 +
99799 +#include <asm/pci.h>
99800 +#define pcifront_sd pci_controller
99801 +
99802 +static inline struct pcifront_device *
99803 +pcifront_get_pdev(struct pcifront_sd *sd)
99804 +{
99805 +       return (struct pcifront_device *)sd->platform_data;
99806 +}
99807 +
99808 +static inline void pcifront_init_sd(struct pcifront_sd *sd, int domain,
99809 +                                   struct pcifront_device *pdev)
99810 +{
99811 +       sd->segment = domain;
99812 +       sd->acpi_handle = NULL;
99813 +       sd->iommu = NULL;
99814 +       sd->windows = 0;
99815 +       sd->window = NULL;
99816 +       sd->platform_data = pdev;
99817 +}
99818 +
99819 +#endif /* __ia64__ */
99820 +
99821 +extern struct rw_semaphore pci_bus_sem;
99822 +
99823 +#endif /* __KERNEL__ */
99824 +
99825 +#endif /* __XEN_ASM_PCIFRONT_H__ */
99826 diff -ruNp linux-2.6.19/include/xen/public/evtchn.h linux-2.6.19-xen-3.0.4/include/xen/public/evtchn.h
99827 --- linux-2.6.19/include/xen/public/evtchn.h    1970-01-01 00:00:00.000000000 +0000
99828 +++ linux-2.6.19-xen-3.0.4/include/xen/public/evtchn.h  2007-02-02 19:11:00.000000000 +0000
99829 @@ -0,0 +1,88 @@
99830 +/******************************************************************************
99831 + * evtchn.h
99832 + * 
99833 + * Interface to /dev/xen/evtchn.
99834 + * 
99835 + * Copyright (c) 2003-2005, K A Fraser
99836 + * 
99837 + * This program is free software; you can redistribute it and/or
99838 + * modify it under the terms of the GNU General Public License version 2
99839 + * as published by the Free Software Foundation; or, when distributed
99840 + * separately from the Linux kernel or incorporated into other
99841 + * software packages, subject to the following license:
99842 + * 
99843 + * Permission is hereby granted, free of charge, to any person obtaining a copy
99844 + * of this source file (the "Software"), to deal in the Software without
99845 + * restriction, including without limitation the rights to use, copy, modify,
99846 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
99847 + * and to permit persons to whom the Software is furnished to do so, subject to
99848 + * the following conditions:
99849 + * 
99850 + * The above copyright notice and this permission notice shall be included in
99851 + * all copies or substantial portions of the Software.
99852 + * 
99853 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99854 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
99855 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
99856 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
99857 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
99858 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
99859 + * IN THE SOFTWARE.
99860 + */
99861 +
99862 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
99863 +#define __LINUX_PUBLIC_EVTCHN_H__
99864 +
99865 +/*
99866 + * Bind a fresh port to VIRQ @virq.
99867 + * Return allocated port.
99868 + */
99869 +#define IOCTL_EVTCHN_BIND_VIRQ                         \
99870 +       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
99871 +struct ioctl_evtchn_bind_virq {
99872 +       unsigned int virq;
99873 +};
99874 +
99875 +/*
99876 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
99877 + * Return allocated port.
99878 + */
99879 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
99880 +       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
99881 +struct ioctl_evtchn_bind_interdomain {
99882 +       unsigned int remote_domain, remote_port;
99883 +};
99884 +
99885 +/*
99886 + * Allocate a fresh port for binding to @remote_domain.
99887 + * Return allocated port.
99888 + */
99889 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
99890 +       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
99891 +struct ioctl_evtchn_bind_unbound_port {
99892 +       unsigned int remote_domain;
99893 +};
99894 +
99895 +/*
99896 + * Unbind previously allocated @port.
99897 + */
99898 +#define IOCTL_EVTCHN_UNBIND                            \
99899 +       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
99900 +struct ioctl_evtchn_unbind {
99901 +       unsigned int port;
99902 +};
99903 +
99904 +/*
99905 + * Send event notification to previously allocated @port.
99906 + */
99907 +#define IOCTL_EVTCHN_NOTIFY                            \
99908 +       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
99909 +struct ioctl_evtchn_notify {
99910 +       unsigned int port;
99911 +};
99912 +
99913 +/* Clear and reinitialise the event buffer. Clear error condition. */
99914 +#define IOCTL_EVTCHN_RESET                             \
99915 +       _IOC(_IOC_NONE, 'E', 5, 0)
99916 +
99917 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
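
For reference, a hedged userspace sketch of driving this interface: bind an unbound port toward dom0, block until it fires, then unmask it by writing the port back. The header install path, device node and the read/write semantics are assumptions about the wider evtchn driver, not defined by this header alone.

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/sys/evtchn.h>            /* assumed install path of this header */

int main(void)
{
        struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 0 };
        uint32_t port;
        int rc, fd = open("/dev/xen/evtchn", O_RDWR);

        if (fd < 0)
                return 1;
        rc = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
        if (rc < 0)
                return 1;
        if (read(fd, &port, sizeof(port)) == sizeof(port)) {   /* blocks until an event */
                printf("event on port %u (allocated port %d)\n", port, rc);
                write(fd, &port, sizeof(port));                 /* unmask the port */
        }
        close(fd);
        return 0;
}
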
99918 diff -ruNp linux-2.6.19/include/xen/public/privcmd.h linux-2.6.19-xen-3.0.4/include/xen/public/privcmd.h
99919 --- linux-2.6.19/include/xen/public/privcmd.h   1970-01-01 00:00:00.000000000 +0000
99920 +++ linux-2.6.19-xen-3.0.4/include/xen/public/privcmd.h 2007-02-02 19:11:00.000000000 +0000
99921 @@ -0,0 +1,79 @@
99922 +/******************************************************************************
99923 + * privcmd.h
99924 + * 
99925 + * Interface to /proc/xen/privcmd.
99926 + * 
99927 + * Copyright (c) 2003-2005, K A Fraser
99928 + * 
99929 + * This program is free software; you can redistribute it and/or
99930 + * modify it under the terms of the GNU General Public License version 2
99931 + * as published by the Free Software Foundation; or, when distributed
99932 + * separately from the Linux kernel or incorporated into other
99933 + * software packages, subject to the following license:
99934 + * 
99935 + * Permission is hereby granted, free of charge, to any person obtaining a copy
99936 + * of this source file (the "Software"), to deal in the Software without
99937 + * restriction, including without limitation the rights to use, copy, modify,
99938 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
99939 + * and to permit persons to whom the Software is furnished to do so, subject to
99940 + * the following conditions:
99941 + * 
99942 + * The above copyright notice and this permission notice shall be included in
99943 + * all copies or substantial portions of the Software.
99944 + * 
99945 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99946 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
99947 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
99948 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
99949 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
99950 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
99951 + * IN THE SOFTWARE.
99952 + */
99953 +
99954 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
99955 +#define __LINUX_PUBLIC_PRIVCMD_H__
99956 +
99957 +#include <linux/types.h>
99958 +
99959 +#ifndef __user
99960 +#define __user
99961 +#endif
99962 +
99963 +typedef struct privcmd_hypercall
99964 +{
99965 +       __u64 op;
99966 +       __u64 arg[5];
99967 +} privcmd_hypercall_t;
99968 +
99969 +typedef struct privcmd_mmap_entry {
99970 +       __u64 va;
99971 +       __u64 mfn;
99972 +       __u64 npages;
99973 +} privcmd_mmap_entry_t; 
99974 +
99975 +typedef struct privcmd_mmap {
99976 +       int num;
99977 +       domid_t dom; /* target domain */
99978 +       privcmd_mmap_entry_t __user *entry;
99979 +} privcmd_mmap_t; 
99980 +
99981 +typedef struct privcmd_mmapbatch {
99982 +       int num;     /* number of pages to populate */
99983 +       domid_t dom; /* target domain */
99984 +       __u64 addr;  /* virtual address */
99985 +       xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
99986 +} privcmd_mmapbatch_t; 
99987 +
99988 +/*
99989 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
99990 + * @arg: &privcmd_hypercall_t
99991 + * Return: Value returned from execution of the specified hypercall.
99992 + */
99993 +#define IOCTL_PRIVCMD_HYPERCALL                                        \
99994 +       _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
99995 +#define IOCTL_PRIVCMD_MMAP                                     \
99996 +       _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
99997 +#define IOCTL_PRIVCMD_MMAPBATCH                                        \
99998 +       _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
99999 +
100000 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
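
A similarly hedged userspace sketch of IOCTL_PRIVCMD_HYPERCALL, issuing what is presumed to be the __HYPERVISOR_xen_version hypercall; the header install path and the hypercall/command numbers come from the Xen public headers, not from this file:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/sys/privcmd.h>           /* assumed install path of this header */

#define HYPERVISOR_xen_version 17      /* __HYPERVISOR_xen_version */
#define XENVER_version          0

int main(void)
{
        privcmd_hypercall_t hc = {
                .op  = HYPERVISOR_xen_version,
                .arg = { XENVER_version, 0 },
        };
        int fd = open("/proc/xen/privcmd", O_RDWR);
        long ver;

        if (fd < 0)
                return 1;
        ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hc);
        if (ver >= 0)
                printf("running on Xen %ld.%ld\n", ver >> 16, ver & 0xffff);
        close(fd);
        return 0;
}
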
100001 diff -ruNp linux-2.6.19/include/xen/xen_proc.h linux-2.6.19-xen-3.0.4/include/xen/xen_proc.h
100002 --- linux-2.6.19/include/xen/xen_proc.h 1970-01-01 00:00:00.000000000 +0000
100003 +++ linux-2.6.19-xen-3.0.4/include/xen/xen_proc.h       2007-02-02 19:11:00.000000000 +0000
100004 @@ -0,0 +1,12 @@
100005 +
100006 +#ifndef __ASM_XEN_PROC_H__
100007 +#define __ASM_XEN_PROC_H__
100008 +
100009 +#include <linux/proc_fs.h>
100010 +
100011 +extern struct proc_dir_entry *create_xen_proc_entry(
100012 +       const char *name, mode_t mode);
100013 +extern void remove_xen_proc_entry(
100014 +       const char *name);
100015 +
100016 +#endif /* __ASM_XEN_PROC_H__ */
100017 diff -ruNp linux-2.6.19/include/xen/xenbus.h linux-2.6.19-xen-3.0.4/include/xen/xenbus.h
100018 --- linux-2.6.19/include/xen/xenbus.h   1970-01-01 00:00:00.000000000 +0000
100019 +++ linux-2.6.19-xen-3.0.4/include/xen/xenbus.h 2007-02-02 19:11:00.000000000 +0000
100020 @@ -0,0 +1,307 @@
100021 +/******************************************************************************
100022 + * xenbus.h
100023 + *
100024 + * Talks to Xen Store to figure out what devices we have.
100025 + *
100026 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
100027 + * Copyright (C) 2005 XenSource Ltd.
100028 + * 
100029 + * This program is free software; you can redistribute it and/or
100030 + * modify it under the terms of the GNU General Public License version 2
100031 + * as published by the Free Software Foundation; or, when distributed
100032 + * separately from the Linux kernel or incorporated into other
100033 + * software packages, subject to the following license:
100034 + * 
100035 + * Permission is hereby granted, free of charge, to any person obtaining a copy
100036 + * of this source file (the "Software"), to deal in the Software without
100037 + * restriction, including without limitation the rights to use, copy, modify,
100038 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100039 + * and to permit persons to whom the Software is furnished to do so, subject to
100040 + * the following conditions:
100041 + * 
100042 + * The above copyright notice and this permission notice shall be included in
100043 + * all copies or substantial portions of the Software.
100044 + * 
100045 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100046 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100047 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100048 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100049 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100050 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100051 + * IN THE SOFTWARE.
100052 + */
100053 +
100054 +#ifndef _XEN_XENBUS_H
100055 +#define _XEN_XENBUS_H
100056 +
100057 +#include <linux/device.h>
100058 +#include <linux/notifier.h>
100059 +#include <linux/mutex.h>
100060 +#include <linux/completion.h>
100061 +#include <linux/init.h>
100062 +#include <xen/interface/xen.h>
100063 +#include <xen/interface/grant_table.h>
100064 +#include <xen/interface/io/xenbus.h>
100065 +#include <xen/interface/io/xs_wire.h>
100066 +
100067 +/* Register callback to watch this node. */
100068 +struct xenbus_watch
100069 +{
100070 +       struct list_head list;
100071 +
100072 +       /* Path being watched. */
100073 +       const char *node;
100074 +
100075 +       /* Callback (executed in a process context with no locks held). */
100076 +       void (*callback)(struct xenbus_watch *,
100077 +                        const char **vec, unsigned int len);
100078 +
100079 +       /* See XBWF_ definitions below. */
100080 +       unsigned long flags;
100081 +};
100082 +
100083 +/*
100084 + * Execute callback in its own kthread. Useful if the callback is long
100085 + * running or heavily serialised, to avoid taking out the main xenwatch thread
100086 + * for a long period of time (or even unwittingly causing a deadlock).
100087 + */
100088 +#define XBWF_new_thread        1
100089 +
100090 +/* A xenbus device. */
100091 +struct xenbus_device {
100092 +       const char *devicetype;
100093 +       const char *nodename;
100094 +       const char *otherend;
100095 +       int otherend_id;
100096 +       struct xenbus_watch otherend_watch;
100097 +       struct device dev;
100098 +       enum xenbus_state state;
100099 +       struct completion down;
100100 +};
100101 +
100102 +static inline struct xenbus_device *to_xenbus_device(struct device *dev)
100103 +{
100104 +       return container_of(dev, struct xenbus_device, dev);
100105 +}
100106 +
100107 +struct xenbus_device_id
100108 +{
100109 +       /* .../device/<device_type>/<identifier> */
100110 +       char devicetype[32];    /* General class of device. */
100111 +};
100112 +
100113 +/* A xenbus driver. */
100114 +struct xenbus_driver {
100115 +       char *name;
100116 +       struct module *owner;
100117 +       const struct xenbus_device_id *ids;
100118 +       int (*probe)(struct xenbus_device *dev,
100119 +                    const struct xenbus_device_id *id);
100120 +       void (*otherend_changed)(struct xenbus_device *dev,
100121 +                                enum xenbus_state backend_state);
100122 +       int (*remove)(struct xenbus_device *dev);
100123 +       int (*suspend)(struct xenbus_device *dev);
100124 +       int (*resume)(struct xenbus_device *dev);
100125 +       int (*uevent)(struct xenbus_device *, char **, int, char *, int);
100126 +       struct device_driver driver;
100127 +       int (*read_otherend_details)(struct xenbus_device *dev);
100128 +};
100129 +
100130 +static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
100131 +{
100132 +       return container_of(drv, struct xenbus_driver, driver);
100133 +}
100134 +
100135 +int xenbus_register_frontend(struct xenbus_driver *drv);
100136 +int xenbus_register_backend(struct xenbus_driver *drv);
100137 +void xenbus_unregister_driver(struct xenbus_driver *drv);
100138 +
100139 +struct xenbus_transaction
100140 +{
100141 +       u32 id;
100142 +};
100143 +
100144 +/* Nil transaction ID. */
100145 +#define XBT_NIL ((struct xenbus_transaction) { 0 })
100146 +
100147 +char **xenbus_directory(struct xenbus_transaction t,
100148 +                       const char *dir, const char *node, unsigned int *num);
100149 +void *xenbus_read(struct xenbus_transaction t,
100150 +                 const char *dir, const char *node, unsigned int *len);
100151 +int xenbus_write(struct xenbus_transaction t,
100152 +                const char *dir, const char *node, const char *string);
100153 +int xenbus_mkdir(struct xenbus_transaction t,
100154 +                const char *dir, const char *node);
100155 +int xenbus_exists(struct xenbus_transaction t,
100156 +                 const char *dir, const char *node);
100157 +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
100158 +int xenbus_transaction_start(struct xenbus_transaction *t);
100159 +int xenbus_transaction_end(struct xenbus_transaction t, int abort);
100160 +
100161 +/* Single read and scanf: returns -errno or num scanned if > 0. */
100162 +int xenbus_scanf(struct xenbus_transaction t,
100163 +                const char *dir, const char *node, const char *fmt, ...)
100164 +       __attribute__((format(scanf, 4, 5)));
100165 +
100166 +/* Single printf and write: returns -errno or 0. */
100167 +int xenbus_printf(struct xenbus_transaction t,
100168 +                 const char *dir, const char *node, const char *fmt, ...)
100169 +       __attribute__((format(printf, 4, 5)));
100170 +
100171 +/* Generic read function: NULL-terminated triples of name,
100172 + * sprintf-style type string, and pointer. Returns 0 or errno.*/
100173 +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
100174 +
100175 +/* notifier routines for when the xenstore comes up */
100176 +int register_xenstore_notifier(struct notifier_block *nb);
100177 +void unregister_xenstore_notifier(struct notifier_block *nb);
100178 +
100179 +int register_xenbus_watch(struct xenbus_watch *watch);
100180 +void unregister_xenbus_watch(struct xenbus_watch *watch);
100181 +void xs_suspend(void);
100182 +void xs_resume(void);
100183 +
100184 +/* Used by xenbus_dev to borrow kernel's store connection. */
100185 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
100186 +
100187 +/* Called from xen core code. */
100188 +void xenbus_suspend(void);
100189 +void xenbus_resume(void);
100190 +
100191 +#define XENBUS_IS_ERR_READ(str) ({                     \
100192 +       if (!IS_ERR(str) && strlen(str) == 0) {         \
100193 +               kfree(str);                             \
100194 +               str = ERR_PTR(-ERANGE);                 \
100195 +       }                                               \
100196 +       IS_ERR(str);                                    \
100197 +})
100198 +
100199 +#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
100200 +
100201 +
100202 +/**
100203 + * Register a watch on the given path, using the given xenbus_watch structure
100204 + * for storage, and the given callback function as the callback.  Return 0 on
100205 + * success, or -errno on error.  On success, the given path will be saved as
100206 + * watch->node, and remains the caller's to free.  On error, watch->node will
100207 + * be NULL, the device will switch to XenbusStateClosing, and the error will
100208 + * be saved in the store.
100209 + */
100210 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
100211 +                     struct xenbus_watch *watch,
100212 +                     void (*callback)(struct xenbus_watch *,
100213 +                                      const char **, unsigned int));
100214 +
100215 +
100216 +/**
100217 + * Register a watch on the given path/path2, using the given xenbus_watch
100218 + * structure for storage, and the given callback function as the callback.
100219 + * Return 0 on success, or -errno on error.  On success, the watched path
100220 + * (path/path2) will be saved as watch->node, and becomes the caller's to
100221 + * kfree().  On error, watch->node will be NULL, so the caller has nothing to
100222 + * free, the device will switch to XenbusStateClosing, and the error will be
100223 + * saved in the store.
100224 + */
100225 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
100226 +                      const char *path2, struct xenbus_watch *watch,
100227 +                      void (*callback)(struct xenbus_watch *,
100228 +                                       const char **, unsigned int));
100229 +
100230 +
100231 +/**
100232 + * Advertise in the store a change of the given driver to the given new_state.
100233 + * Return 0 on success, or -errno on error.  On error, the device will switch
100234 + * to XenbusStateClosing, and the error will be saved in the store.
100235 + */
100236 +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
100237 +
100238 +
100239 +/**
100240 + * Grant access to the given ring_mfn to the peer of the given device.  Return
100241 + * 0 on success, or -errno on error.  On error, the device will switch to
100242 + * XenbusStateClosing, and the error will be saved in the store.
100243 + */
100244 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
100245 +
100246 +
100247 +/**
100248 + * Map a page of memory into this domain from another domain's grant table.
100249 + * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
100250 + * page to that address, and sets *vaddr to that address.
100251 + * xenbus_map_ring does not allocate the virtual address space (you must do
100252 + * this yourself!). It only maps in the page to the specified address.
100253 + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
100254 + * or -ENOMEM on error. If an error is returned, device will switch to
100255 + * XenbusStateClosing and the error message will be saved in XenStore.
100256 + */
100257 +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
100258 +                                        int gnt_ref);
100259 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
100260 +                          grant_handle_t *handle, void *vaddr);
100261 +
100262 +
100263 +/**
100264 + * Unmap a page of memory in this domain that was imported from another domain.
100265 + * Use xenbus_unmap_ring_vfree if you mapped in your memory with
100266 + * xenbus_map_ring_valloc (it will free the virtual address space).
100267 + * Returns 0 on success and returns GNTST_* on error
100268 + * (see xen/include/interface/grant_table.h).
100269 + */
100270 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
100271 +int xenbus_unmap_ring(struct xenbus_device *dev,
100272 +                     grant_handle_t handle, void *vaddr);
100273 +
100274 +
100275 +/**
100276 + * Allocate an event channel for the given xenbus_device, assigning the newly
100277 + * created local port to *port.  Return 0 on success, or -errno on error.  On
100278 + * error, the device will switch to XenbusStateClosing, and the error will be
100279 + * saved in the store.
100280 + */
100281 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
100282 +
100283 +
100284 +/**
100285 + * Bind to an existing interdomain event channel in another domain. Returns 0
100286 + * on success and stores the local port in *port. On error, returns -errno,
100287 + * switches the device to XenbusStateClosing, and saves the error in XenStore.
100288 + */
100289 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
100290 +
100291 +
100292 +/**
100293 + * Free an existing event channel. Returns 0 on success or -errno on error.
100294 + */
100295 +int xenbus_free_evtchn(struct xenbus_device *dev, int port);
100296 +
100297 +
100298 +/**
100299 + * Return the state of the driver rooted at the given store path, or
100300 + * XenbusStateUnknown if no state can be read.
100301 + */
100302 +enum xenbus_state xenbus_read_driver_state(const char *path);
100303 +
100304 +
100305 +/***
100306 + * Report the given negative errno into the store, along with the given
100307 + * formatted message.
100308 + */
100309 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
100310 +                     ...);
100311 +
100312 +
100313 +/***
100314 + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
100315 + * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
100316 + * closedown of this driver and its peer.
100317 + */
100318 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
100319 +                     ...);
100320 +
100321 +int __init xenbus_dev_init(void);
100322 +
100323 +char *xenbus_strstate(enum xenbus_state state);
100324 +int xenbus_dev_is_online(struct xenbus_device *dev);
100325 +int xenbus_frontend_closed(struct xenbus_device *dev);
100326 +
100327 +#endif /* _XEN_XENBUS_H */
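
To show the intended use of the transaction and gather helpers declared above, a kernel-side sketch (the node names are illustrative, not part of this header): read the peer's ring reference and event channel, retrying the transaction on -EAGAIN.

static int example_read_ring_info(struct xenbus_device *dev,
                                  unsigned long *ring_ref,
                                  unsigned int *evtchn)
{
        struct xenbus_transaction xbt;
        int err;

again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return err;

        err = xenbus_gather(xbt, dev->otherend,
                            "ring-ref", "%lu", ring_ref,
                            "event-channel", "%u", evtchn,
                            NULL);
        if (err) {
                xenbus_transaction_end(xbt, 1);   /* abort */
                return err;
        }

        err = xenbus_transaction_end(xbt, 0);     /* commit */
        if (err == -EAGAIN)
                goto again;
        return err;
}
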
100328 diff -ruNp linux-2.6.19/include/xen/xencons.h linux-2.6.19-xen-3.0.4/include/xen/xencons.h
100329 --- linux-2.6.19/include/xen/xencons.h  1970-01-01 00:00:00.000000000 +0000
100330 +++ linux-2.6.19-xen-3.0.4/include/xen/xencons.h        2007-02-02 19:11:00.000000000 +0000
100331 @@ -0,0 +1,19 @@
100332 +#ifndef __ASM_XENCONS_H__
100333 +#define __ASM_XENCONS_H__
100334 +
100335 +struct dom0_vga_console_info;
100336 +void dom0_init_screen_info(const struct dom0_vga_console_info *info);
100337 +
100338 +void xencons_force_flush(void);
100339 +void xencons_resume(void);
100340 +
100341 +/* Interrupt work hooks. Receive data, or kick data out. */
100342 +void xencons_rx(char *buf, unsigned len);
100343 +void xencons_tx(void);
100344 +
100345 +int xencons_ring_init(void);
100346 +int xencons_ring_send(const char *data, unsigned len);
100347 +
100348 +void xencons_early_setup(void);
100349 +
100350 +#endif /* __ASM_XENCONS_H__ */
100351 diff -ruNp linux-2.6.19/include/xen/xenoprof.h linux-2.6.19-xen-3.0.4/include/xen/xenoprof.h
100352 --- linux-2.6.19/include/xen/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
100353 +++ linux-2.6.19-xen-3.0.4/include/xen/xenoprof.h       2007-02-02 19:11:00.000000000 +0000
100354 @@ -0,0 +1,42 @@
100355 +/******************************************************************************
100356 + * xen/xenoprof.h
100357 + *
100358 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
100359 + *                    VA Linux Systems Japan K.K.
100360 + *
100361 + * This program is free software; you can redistribute it and/or modify
100362 + * it under the terms of the GNU General Public License as published by
100363 + * the Free Software Foundation; either version 2 of the License, or
100364 + * (at your option) any later version.
100365 + *
100366 + * This program is distributed in the hope that it will be useful,
100367 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
100368 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
100369 + * GNU General Public License for more details.
100370 + *
100371 + * You should have received a copy of the GNU General Public License
100372 + * along with this program; if not, write to the Free Software
100373 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
100374 + *
100375 + */
100376 +
100377 +#ifndef __XEN_XENOPROF_H__
100378 +#define __XEN_XENOPROF_H__
100379 +#ifdef CONFIG_XEN
100380 +
100381 +#include <asm/xenoprof.h>
100382 +
100383 +struct oprofile_operations;
100384 +int xenoprofile_init(struct oprofile_operations * ops);
100385 +void xenoprofile_exit(void);
100386 +
100387 +struct xenoprof_shared_buffer {
100388 +       char                                    *buffer;
100389 +       struct xenoprof_arch_shared_buffer      arch;
100390 +};
100391 +#else
100392 +#define xenoprofile_init(ops)  (-ENOSYS)
100393 +#define xenoprofile_exit()     do { } while (0)
100394 +
100395 +#endif /* CONFIG_XEN */
100396 +#endif /* __XEN_XENOPROF_H__ */
100397 diff -ruNp linux-2.6.19/kernel/Kconfig.preempt linux-2.6.19-xen-3.0.4/kernel/Kconfig.preempt
100398 --- linux-2.6.19/kernel/Kconfig.preempt 2006-11-29 21:57:37.000000000 +0000
100399 +++ linux-2.6.19-xen-3.0.4/kernel/Kconfig.preempt       2007-02-02 19:11:00.000000000 +0000
100400 @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
100401  
100402  config PREEMPT
100403         bool "Preemptible Kernel (Low-Latency Desktop)"
100404 +       depends on !XEN
100405         help
100406           This option reduces the latency of the kernel by making
100407           all kernel code (that is not executing in a critical section)
100408 diff -ruNp linux-2.6.19/kernel/fork.c linux-2.6.19-xen-3.0.4/kernel/fork.c
100409 --- linux-2.6.19/kernel/fork.c  2006-11-29 21:57:37.000000000 +0000
100410 +++ linux-2.6.19-xen-3.0.4/kernel/fork.c        2007-02-02 19:11:00.000000000 +0000
100411 @@ -285,6 +285,9 @@ static inline int dup_mmap(struct mm_str
100412                 if (retval)
100413                         goto out;
100414         }
100415 +#ifdef arch_dup_mmap
100416 +       arch_dup_mmap(mm, oldmm);
100417 +#endif
100418         retval = 0;
100419  out:
100420         up_write(&mm->mmap_sem);
100421 diff -ruNp linux-2.6.19/kernel/irq/manage.c linux-2.6.19-xen-3.0.4/kernel/irq/manage.c
100422 --- linux-2.6.19/kernel/irq/manage.c    2006-11-29 21:57:37.000000000 +0000
100423 +++ linux-2.6.19-xen-3.0.4/kernel/irq/manage.c  2007-02-02 19:11:00.000000000 +0000
100424 @@ -355,7 +355,6 @@ void free_irq(unsigned int irq, void *de
100425         struct irqaction **p;
100426         unsigned long flags;
100427  
100428 -       WARN_ON(in_interrupt());
100429         if (irq >= NR_IRQS)
100430                 return;
100431  
100432 diff -ruNp linux-2.6.19/kernel/kexec.c linux-2.6.19-xen-3.0.4/kernel/kexec.c
100433 --- linux-2.6.19/kernel/kexec.c 2006-11-29 21:57:37.000000000 +0000
100434 +++ linux-2.6.19-xen-3.0.4/kernel/kexec.c       2007-02-02 19:11:00.000000000 +0000
100435 @@ -403,7 +403,7 @@ static struct page *kimage_alloc_normal_
100436                 pages = kimage_alloc_pages(GFP_KERNEL, order);
100437                 if (!pages)
100438                         break;
100439 -               pfn   = page_to_pfn(pages);
100440 +               pfn   = kexec_page_to_pfn(pages);
100441                 epfn  = pfn + count;
100442                 addr  = pfn << PAGE_SHIFT;
100443                 eaddr = epfn << PAGE_SHIFT;
100444 @@ -437,6 +437,7 @@ static struct page *kimage_alloc_normal_
100445         return pages;
100446  }
100447  
100448 +#ifndef CONFIG_XEN
100449  static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
100450                                                       unsigned int order)
100451  {
100452 @@ -490,7 +491,7 @@ static struct page *kimage_alloc_crash_c
100453                 }
100454                 /* If I don't overlap any segments I have found my hole! */
100455                 if (i == image->nr_segments) {
100456 -                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
100457 +                       pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
100458                         break;
100459                 }
100460         }
100461 @@ -517,6 +518,13 @@ struct page *kimage_alloc_control_pages(
100462  
100463         return pages;
100464  }
100465 +#else /* !CONFIG_XEN */
100466 +struct page *kimage_alloc_control_pages(struct kimage *image,
100467 +                                        unsigned int order)
100468 +{
100469 +       return kimage_alloc_normal_control_pages(image, order);
100470 +}
100471 +#endif
100472  
100473  static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
100474  {
100475 @@ -532,7 +540,7 @@ static int kimage_add_entry(struct kimag
100476                         return -ENOMEM;
100477  
100478                 ind_page = page_address(page);
100479 -               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
100480 +               *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
100481                 image->entry = ind_page;
100482                 image->last_entry = ind_page +
100483                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
100484 @@ -593,13 +601,13 @@ static int kimage_terminate(struct kimag
100485  #define for_each_kimage_entry(image, ptr, entry) \
100486         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
100487                 ptr = (entry & IND_INDIRECTION)? \
100488 -                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
100489 +                       kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
100490  
100491  static void kimage_free_entry(kimage_entry_t entry)
100492  {
100493         struct page *page;
100494  
100495 -       page = pfn_to_page(entry >> PAGE_SHIFT);
100496 +       page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
100497         kimage_free_pages(page);
100498  }
100499  
100500 @@ -611,6 +619,10 @@ static void kimage_free(struct kimage *i
100501         if (!image)
100502                 return;
100503  
100504 +#ifdef CONFIG_XEN
100505 +       xen_machine_kexec_unload(image);
100506 +#endif
100507 +
100508         kimage_free_extra_pages(image);
100509         for_each_kimage_entry(image, ptr, entry) {
100510                 if (entry & IND_INDIRECTION) {
100511 @@ -686,7 +698,7 @@ static struct page *kimage_alloc_page(st
100512          * have a match.
100513          */
100514         list_for_each_entry(page, &image->dest_pages, lru) {
100515 -               addr = page_to_pfn(page) << PAGE_SHIFT;
100516 +               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
100517                 if (addr == destination) {
100518                         list_del(&page->lru);
100519                         return page;
100520 @@ -701,12 +713,12 @@ static struct page *kimage_alloc_page(st
100521                 if (!page)
100522                         return NULL;
100523                 /* If the page cannot be used file it away */
100524 -               if (page_to_pfn(page) >
100525 +               if (kexec_page_to_pfn(page) >
100526                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
100527                         list_add(&page->lru, &image->unuseable_pages);
100528                         continue;
100529                 }
100530 -               addr = page_to_pfn(page) << PAGE_SHIFT;
100531 +               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
100532  
100533                 /* If it is the destination page we want use it */
100534                 if (addr == destination)
100535 @@ -729,7 +741,7 @@ static struct page *kimage_alloc_page(st
100536                         struct page *old_page;
100537  
100538                         old_addr = *old & PAGE_MASK;
100539 -                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
100540 +                       old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
100541                         copy_highpage(page, old_page);
100542                         *old = addr | (*old & ~PAGE_MASK);
100543  
100544 @@ -779,7 +791,7 @@ static int kimage_load_normal_segment(st
100545                         result  = -ENOMEM;
100546                         goto out;
100547                 }
100548 -               result = kimage_add_page(image, page_to_pfn(page)
100549 +               result = kimage_add_page(image, kexec_page_to_pfn(page)
100550                                                                 << PAGE_SHIFT);
100551                 if (result < 0)
100552                         goto out;
100553 @@ -811,6 +823,7 @@ out:
100554         return result;
100555  }
100556  
100557 +#ifndef CONFIG_XEN
100558  static int kimage_load_crash_segment(struct kimage *image,
100559                                         struct kexec_segment *segment)
100560  {
100561 @@ -833,7 +846,7 @@ static int kimage_load_crash_segment(str
100562                 char *ptr;
100563                 size_t uchunk, mchunk;
100564  
100565 -               page = pfn_to_page(maddr >> PAGE_SHIFT);
100566 +               page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
100567                 if (page == 0) {
100568                         result  = -ENOMEM;
100569                         goto out;
100570 @@ -881,6 +894,13 @@ static int kimage_load_segment(struct ki
100571  
100572         return result;
100573  }
100574 +#else /* CONFIG_XEN */
100575 +static int kimage_load_segment(struct kimage *image,
100576 +                               struct kexec_segment *segment)
100577 +{
100578 +       return kimage_load_normal_segment(image, segment);
100579 +}
100580 +#endif
100581  
100582  /*
100583   * Exec Kernel system call: for obvious reasons only root may call it.
100584 @@ -991,6 +1011,11 @@ asmlinkage long sys_kexec_load(unsigned 
100585                 if (result)
100586                         goto out;
100587         }
100588 +#ifdef CONFIG_XEN
100589 +       result = xen_machine_kexec_load(image);
100590 +       if (result)
100591 +               goto out;
100592 +#endif
100593         /* Install the new kernel, and  Uninstall the old */
100594         image = xchg(dest_image, image);
100595  
100596 @@ -1045,7 +1070,6 @@ void crash_kexec(struct pt_regs *regs)
100597  {
100598         int locked;
100599  
100600 -
100601         /* Take the kexec_lock here to prevent sys_kexec_load
100602          * running on one cpu from replacing the crash kernel
100603          * we are using after a panic on a different cpu.
100604 diff -ruNp linux-2.6.19/lib/Kconfig.debug linux-2.6.19-xen-3.0.4/lib/Kconfig.debug
100605 --- linux-2.6.19/lib/Kconfig.debug      2006-11-29 21:57:37.000000000 +0000
100606 +++ linux-2.6.19-xen-3.0.4/lib/Kconfig.debug    2007-02-02 19:11:01.000000000 +0000
100607 @@ -341,7 +341,7 @@ config FRAME_POINTER
100608  
100609  config UNWIND_INFO
100610         bool "Compile the kernel with frame unwind information"
100611 -       depends on !IA64 && !PARISC && !ARM
100612 +       depends on !IA64 && !PARISC && !ARM && !X86_64_XEN
100613         depends on !MODULES || !(MIPS || PPC || SUPERH || V850)
100614         help
100615           If you say Y here the resulting kernel image will be slightly larger
100616 diff -ruNp linux-2.6.19/lib/Makefile linux-2.6.19-xen-3.0.4/lib/Makefile
100617 --- linux-2.6.19/lib/Makefile   2006-11-29 21:57:37.000000000 +0000
100618 +++ linux-2.6.19-xen-3.0.4/lib/Makefile 2007-02-02 19:11:01.000000000 +0000
100619 @@ -54,6 +54,7 @@ obj-$(CONFIG_SMP) += percpu_counter.o
100620  obj-$(CONFIG_AUDIT_GENERIC) += audit.o
100621  
100622  obj-$(CONFIG_SWIOTLB) += swiotlb.o
100623 +swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
100624  
100625  hostprogs-y    := gen_crc32table
100626  clean-files    := crc32table.h
100627 diff -ruNp linux-2.6.19/mm/Kconfig linux-2.6.19-xen-3.0.4/mm/Kconfig
100628 --- linux-2.6.19/mm/Kconfig     2006-11-29 21:57:37.000000000 +0000
100629 +++ linux-2.6.19-xen-3.0.4/mm/Kconfig   2007-02-02 19:11:01.000000000 +0000
100630 @@ -132,11 +132,14 @@ config MEMORY_HOTPLUG_SPARSE
100631  # Default to 4 for wider testing, though 8 might be more appropriate.
100632  # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
100633  # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
100634 +# XEN on the x86 architecture uses the mapping field on pagetable pages to store a
100635 +# pointer to the destructor. This conflicts with pte_lock_deinit().
100636  #
100637  config SPLIT_PTLOCK_CPUS
100638         int
100639         default "4096" if ARM && !CPU_CACHE_VIPT
100640         default "4096" if PARISC && !PA20
100641 +       default "4096" if X86_XEN || X86_64_XEN
100642         default "4"
100643  
100644  #
100645 diff -ruNp linux-2.6.19/mm/highmem.c linux-2.6.19-xen-3.0.4/mm/highmem.c
100646 --- linux-2.6.19/mm/highmem.c   2006-11-29 21:57:37.000000000 +0000
100647 +++ linux-2.6.19-xen-3.0.4/mm/highmem.c 2007-02-02 19:11:01.000000000 +0000
100648 @@ -148,6 +148,17 @@ start:
100649         return vaddr;
100650  }
100651  
100652 +#ifdef CONFIG_XEN
100653 +void kmap_flush_unused(void)
100654 +{
100655 +       spin_lock(&kmap_lock);
100656 +       flush_all_zero_pkmaps();
100657 +       spin_unlock(&kmap_lock);
100658 +}
100659 +
100660 +EXPORT_SYMBOL(kmap_flush_unused);
100661 +#endif
100662 +
100663  void fastcall *kmap_high(struct page *page)
100664  {
100665         unsigned long vaddr;
100666 diff -ruNp linux-2.6.19/mm/memory.c linux-2.6.19-xen-3.0.4/mm/memory.c
100667 --- linux-2.6.19/mm/memory.c    2007-02-02 20:26:43.000000000 +0000
100668 +++ linux-2.6.19-xen-3.0.4/mm/memory.c  2007-02-02 19:11:01.000000000 +0000
100669 @@ -404,7 +404,8 @@ struct page *vm_normal_page(struct vm_ar
100670          * and that the resulting page looks ok.
100671          */
100672         if (unlikely(!pfn_valid(pfn))) {
100673 -               print_bad_pte(vma, pte, addr);
100674 +               if (!(vma->vm_flags & VM_RESERVED))
100675 +                       print_bad_pte(vma, pte, addr);
100676                 return NULL;
100677         }
100678  
100679 @@ -896,6 +897,7 @@ unsigned long zap_page_range(struct vm_a
100680                 tlb_finish_mmu(tlb, address, end);
100681         return end;
100682  }
100683 +EXPORT_SYMBOL(zap_page_range);
100684  
100685  /*
100686   * Do a quick page-table lookup for a single page.
100687 @@ -1035,6 +1037,26 @@ int get_user_pages(struct task_struct *t
100688                         continue;
100689                 }
100690  
100691 +#ifdef CONFIG_XEN
100692 +               if (vma && (vma->vm_flags & VM_FOREIGN)) {
100693 +                       struct page **map = vma->vm_private_data;
100694 +                       int offset = (start - vma->vm_start) >> PAGE_SHIFT;
100695 +                       if (map[offset] != NULL) {
100696 +                               if (pages) {
100697 +                                       struct page *page = map[offset];
100698 +                                       
100699 +                                       pages[i] = page;
100700 +                                       get_page(page);
100701 +                               }
100702 +                               if (vmas)
100703 +                                       vmas[i] = vma;
100704 +                               i++;
100705 +                               start += PAGE_SIZE;
100706 +                               len--;
100707 +                               continue;
100708 +                       }
100709 +               }
100710 +#endif
100711                 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
100712                                 || !(vm_flags & vma->vm_flags))
100713                         return i ? : -EFAULT;
100714 @@ -1403,6 +1425,102 @@ int remap_pfn_range(struct vm_area_struc
100715  }
100716  EXPORT_SYMBOL(remap_pfn_range);
100717  
100718 +#ifdef CONFIG_XEN
100719 +static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
100720 +                                    unsigned long addr, unsigned long end,
100721 +                                    pte_fn_t fn, void *data)
100722 +{
100723 +       pte_t *pte;
100724 +       int err;
100725 +       struct page *pmd_page;
100726 +       spinlock_t *ptl;
100727 +
100728 +       pte = (mm == &init_mm) ?
100729 +               pte_alloc_kernel(pmd, addr) :
100730 +               pte_alloc_map_lock(mm, pmd, addr, &ptl);
100731 +       if (!pte)
100732 +               return -ENOMEM;
100733 +
100734 +       BUG_ON(pmd_huge(*pmd));
100735 +
100736 +       pmd_page = pmd_page(*pmd);
100737 +
100738 +       do {
100739 +               err = fn(pte, pmd_page, addr, data);
100740 +               if (err)
100741 +                       break;
100742 +       } while (pte++, addr += PAGE_SIZE, addr != end);
100743 +
100744 +       if (mm != &init_mm)
100745 +               pte_unmap_unlock(pte-1, ptl);
100746 +       return err;
100747 +}
100748 +
100749 +static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
100750 +                                    unsigned long addr, unsigned long end,
100751 +                                    pte_fn_t fn, void *data)
100752 +{
100753 +       pmd_t *pmd;
100754 +       unsigned long next;
100755 +       int err;
100756 +
100757 +       pmd = pmd_alloc(mm, pud, addr);
100758 +       if (!pmd)
100759 +               return -ENOMEM;
100760 +       do {
100761 +               next = pmd_addr_end(addr, end);
100762 +               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
100763 +               if (err)
100764 +                       break;
100765 +       } while (pmd++, addr = next, addr != end);
100766 +       return err;
100767 +}
100768 +
100769 +static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
100770 +                                    unsigned long addr, unsigned long end,
100771 +                                    pte_fn_t fn, void *data)
100772 +{
100773 +       pud_t *pud;
100774 +       unsigned long next;
100775 +       int err;
100776 +
100777 +       pud = pud_alloc(mm, pgd, addr);
100778 +       if (!pud)
100779 +               return -ENOMEM;
100780 +       do {
100781 +               next = pud_addr_end(addr, end);
100782 +               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
100783 +               if (err)
100784 +                       break;
100785 +       } while (pud++, addr = next, addr != end);
100786 +       return err;
100787 +}
100788 +
100789 +/*
100790 + * Scan a region of virtual memory, filling in page tables as necessary
100791 + * and calling a provided function on each leaf page table.
100792 + */
100793 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
100794 +                       unsigned long size, pte_fn_t fn, void *data)
100795 +{
100796 +       pgd_t *pgd;
100797 +       unsigned long next;
100798 +       unsigned long end = addr + size;
100799 +       int err;
100800 +
100801 +       BUG_ON(addr >= end);
100802 +       pgd = pgd_offset(mm, addr);
100803 +       do {
100804 +               next = pgd_addr_end(addr, end);
100805 +               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
100806 +               if (err)
100807 +                       break;
100808 +       } while (pgd++, addr = next, addr != end);
100809 +       return err;
100810 +}
100811 +EXPORT_SYMBOL_GPL(apply_to_page_range);
100812 +#endif
100813 +
100814  /*
100815   * handle_pte_fault chooses page fault handler according to an entry
100816   * which was read non-atomically.  Before making any commitment, on
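
As a sketch of the callback contract for the apply_to_page_range() helper added above (purely illustrative: the walk allocates intermediate tables as needed and stops when the callback returns non-zero):

static int count_pte(pte_t *pte, struct page *pmd_page,
                     unsigned long addr, void *data)
{
        unsigned long *count = data;

        (*count)++;
        return 0;                      /* non-zero aborts the walk */
}

static unsigned long count_pte_slots(struct mm_struct *mm,
                                     unsigned long addr, unsigned long size)
{
        unsigned long n = 0;

        apply_to_page_range(mm, addr, size, count_pte, &n);
        return n;
}
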
100817 diff -ruNp linux-2.6.19/mm/mmap.c linux-2.6.19-xen-3.0.4/mm/mmap.c
100818 --- linux-2.6.19/mm/mmap.c      2006-11-29 21:57:37.000000000 +0000
100819 +++ linux-2.6.19-xen-3.0.4/mm/mmap.c    2007-02-02 19:11:01.000000000 +0000
100820 @@ -1970,6 +1970,10 @@ void exit_mmap(struct mm_struct *mm)
100821         unsigned long nr_accounted = 0;
100822         unsigned long end;
100823  
100824 +#ifdef arch_exit_mmap
100825 +       arch_exit_mmap(mm);
100826 +#endif
100827 +
100828         lru_add_drain();
100829         flush_cache_mm(mm);
100830         tlb = tlb_gather_mmu(mm, 1);
100831 diff -ruNp linux-2.6.19/mm/page_alloc.c linux-2.6.19-xen-3.0.4/mm/page_alloc.c
100832 --- linux-2.6.19/mm/page_alloc.c        2006-11-29 21:57:37.000000000 +0000
100833 +++ linux-2.6.19-xen-3.0.4/mm/page_alloc.c      2007-02-02 19:11:01.000000000 +0000
100834 @@ -496,6 +496,8 @@ static void __free_pages_ok(struct page 
100835         int i;
100836         int reserved = 0;
100837  
100838 +       if (arch_free_page(page, order))
100839 +               return;
100840         for (i = 0 ; i < (1 << order) ; ++i)
100841                 reserved += free_pages_check(page + i);
100842         if (reserved)
100843 @@ -503,7 +505,6 @@ static void __free_pages_ok(struct page 
100844  
100845         if (!PageHighMem(page))
100846                 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
100847 -       arch_free_page(page, order);
100848         kernel_map_pages(page, 1 << order, 0);
100849  
100850         local_irq_save(flags);
100851 @@ -781,6 +782,9 @@ static void fastcall free_hot_cold_page(
100852         struct per_cpu_pages *pcp;
100853         unsigned long flags;
100854  
100855 +       if (arch_free_page(page, 0))
100856 +               return;
100857 +
100858         if (PageAnon(page))
100859                 page->mapping = NULL;
100860         if (free_pages_check(page))
100861 diff -ruNp linux-2.6.19/net/core/dev.c linux-2.6.19-xen-3.0.4/net/core/dev.c
100862 --- linux-2.6.19/net/core/dev.c 2006-11-29 21:57:37.000000000 +0000
100863 +++ linux-2.6.19-xen-3.0.4/net/core/dev.c       2007-02-02 19:11:01.000000000 +0000
100864 @@ -118,6 +118,12 @@
100865  #include <linux/err.h>
100866  #include <linux/ctype.h>
100867  
100868 +#ifdef CONFIG_XEN
100869 +#include <net/ip.h>
100870 +#include <linux/tcp.h>
100871 +#include <linux/udp.h>
100872 +#endif
100873 +
100874  /*
100875   *     The list of packet types we will receive (as opposed to discard)
100876   *     and the routines to invoke.
100877 @@ -1393,6 +1399,43 @@ out_kfree_skb:
100878         }                                               \
100879  }
100880  
100881 +#ifdef CONFIG_XEN
100882 +inline int skb_checksum_setup(struct sk_buff *skb)
100883 +{
100884 +       if (skb->proto_csum_blank) {
100885 +               if (skb->protocol != htons(ETH_P_IP))
100886 +                       goto out;
100887 +               skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
100888 +               if (skb->h.raw >= skb->tail)
100889 +                       goto out;
100890 +               switch (skb->nh.iph->protocol) {
100891 +               case IPPROTO_TCP:
100892 +                       skb->csum = offsetof(struct tcphdr, check);
100893 +                       break;
100894 +               case IPPROTO_UDP:
100895 +                       skb->csum = offsetof(struct udphdr, check);
100896 +                       break;
100897 +               default:
100898 +                       if (net_ratelimit())
100899 +                               printk(KERN_ERR "Attempting to checksum a non-"
100900 +                                      "TCP/UDP packet, dropping a protocol"
100901 +                                      " %d packet", skb->nh.iph->protocol);
100902 +                       goto out;
100903 +               }
100904 +               if ((skb->h.raw + skb->csum + 2) > skb->tail)
100905 +                       goto out;
100906 +               skb->ip_summed = CHECKSUM_PARTIAL;
100907 +               skb->proto_csum_blank = 0;
100908 +       }
100909 +       return 0;
100910 +out:
100911 +       return -EPROTO;
100912 +}
100913 +#else
100914 +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
100915 +#endif
100916 +
100917 +
100918  /**
100919   *     dev_queue_xmit - transmit a buffer
100920   *     @skb: buffer to transmit
100921 @@ -1425,6 +1468,12 @@ int dev_queue_xmit(struct sk_buff *skb)
100922         struct Qdisc *q;
100923         int rc = -ENOMEM;
100924  
100925 +       /* If a checksum-deferred packet is forwarded to a device that needs a
100926 +        * checksum, correct the pointers and force checksumming.
100927 +        */
100928 +       if (skb_checksum_setup(skb))
100929 +               goto out_kfree_skb;
100930 +
100931         /* GSO will handle the following emulations directly. */
100932         if (netif_needs_gso(dev, skb))
100933                 goto gso;
100934 @@ -1800,6 +1849,19 @@ int netif_receive_skb(struct sk_buff *sk
100935         }
100936  #endif
100937  
100938 +#ifdef CONFIG_XEN
100939 +       switch (skb->ip_summed) {
100940 +       case CHECKSUM_UNNECESSARY:
100941 +               skb->proto_data_valid = 1;
100942 +               break;
100943 +       case CHECKSUM_PARTIAL:
100944 +               /* XXX Implement me. */
100945 +       default:
100946 +               skb->proto_data_valid = 0;
100947 +               break;
100948 +       }
100949 +#endif
100950 +
100951         list_for_each_entry_rcu(ptype, &ptype_all, list) {
100952                 if (!ptype->dev || ptype->dev == skb->dev) {
100953                         if (pt_prev) 
100954 @@ -3586,6 +3648,7 @@ EXPORT_SYMBOL(unregister_netdevice_notif
100955  EXPORT_SYMBOL(net_enable_timestamp);
100956  EXPORT_SYMBOL(net_disable_timestamp);
100957  EXPORT_SYMBOL(dev_get_flags);
100958 +EXPORT_SYMBOL(skb_checksum_setup);
100959  
100960  #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
100961  EXPORT_SYMBOL(br_handle_frame_hook);
100962 diff -ruNp linux-2.6.19/net/core/skbuff.c linux-2.6.19-xen-3.0.4/net/core/skbuff.c
100963 --- linux-2.6.19/net/core/skbuff.c      2006-11-29 21:57:37.000000000 +0000
100964 +++ linux-2.6.19-xen-3.0.4/net/core/skbuff.c    2007-02-02 19:11:01.000000000 +0000
100965 @@ -139,6 +139,7 @@ EXPORT_SYMBOL(skb_truesize_bug);
100966   *     Buffers may only be allocated from interrupts using a @gfp_mask of
100967   *     %GFP_ATOMIC.
100968   */
100969 +#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
100970  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
100971                             int fclone)
100972  {
100973 @@ -194,6 +195,7 @@ nodata:
100974         skb = NULL;
100975         goto out;
100976  }
100977 +#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
100978  
100979  /**
100980   *     alloc_skb_from_cache    -       allocate a network buffer
100981 @@ -211,14 +213,17 @@ nodata:
100982   */
100983  struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
100984                                      unsigned int size,
100985 -                                    gfp_t gfp_mask)
100986 +                                    gfp_t gfp_mask,
100987 +                                    int fclone)
100988  {
100989 +       kmem_cache_t *cache;
100990         struct sk_buff *skb;
100991         u8 *data;
100992  
100993 +       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
100994 +
100995         /* Get the HEAD */
100996 -       skb = kmem_cache_alloc(skbuff_head_cache,
100997 -                              gfp_mask & ~__GFP_DMA);
100998 +       skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
100999         if (!skb)
101000                 goto out;
101001  
101002 @@ -242,10 +247,20 @@ struct sk_buff *alloc_skb_from_cache(kme
101003         skb_shinfo(skb)->gso_segs = 0;
101004         skb_shinfo(skb)->gso_type = 0;
101005         skb_shinfo(skb)->frag_list = NULL;
101006 +
101007 +       if (fclone) {
101008 +               struct sk_buff *child = skb + 1;
101009 +               atomic_t *fclone_ref = (atomic_t *) (child + 1);
101010 +
101011 +               skb->fclone = SKB_FCLONE_ORIG;
101012 +               atomic_set(fclone_ref, 1);
101013 +
101014 +               child->fclone = SKB_FCLONE_UNAVAILABLE;
101015 +       }
101016  out:
101017         return skb;
101018  nodata:
101019 -       kmem_cache_free(skbuff_head_cache, skb);
101020 +       kmem_cache_free(cache, skb);
101021         skb = NULL;
101022         goto out;
101023  }
101024 @@ -465,6 +480,10 @@ struct sk_buff *skb_clone(struct sk_buff
101025         C(local_df);
101026         n->cloned = 1;
101027         n->nohdr = 0;
101028 +#ifdef CONFIG_XEN
101029 +       C(proto_data_valid);
101030 +       C(proto_csum_blank);
101031 +#endif
101032         C(pkt_type);
101033         C(ip_summed);
101034         C(priority);
101035 @@ -860,12 +879,12 @@ int ___pskb_trim(struct sk_buff *skb, un
101036         int offset = skb_headlen(skb);
101037         int nfrags = skb_shinfo(skb)->nr_frags;
101038         int i;
101039 +
101040         int err;
101041  
101042         if (skb_cloned(skb) &&
101043             unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
101044                 return err;
101045 -
101046         i = 0;
101047         if (offset >= len)
101048                 goto drop_pages;
101049 diff -ruNp linux-2.6.19/net/ipv4/netfilter/ip_nat_proto_tcp.c linux-2.6.19-xen-3.0.4/net/ipv4/netfilter/ip_nat_proto_tcp.c
101050 --- linux-2.6.19/net/ipv4/netfilter/ip_nat_proto_tcp.c  2006-11-29 21:57:37.000000000 +0000
101051 +++ linux-2.6.19-xen-3.0.4/net/ipv4/netfilter/ip_nat_proto_tcp.c        2007-02-02 19:11:02.000000000 +0000
101052 @@ -129,9 +129,16 @@ tcp_manip_pkt(struct sk_buff **pskb,
101053         if (hdrsize < sizeof(*hdr))
101054                 return 1;
101055  
101056 +#ifdef CONFIG_XEN
101057 +       if ((*pskb)->proto_csum_blank)
101058 +               hdr->check = ~nf_csum_update(~oldip, newip, hdr->check);
101059 +       else
101060 +#endif
101061 +       {
101062         hdr->check = nf_proto_csum_update(*pskb, ~oldip, newip, hdr->check, 1);
101063         hdr->check = nf_proto_csum_update(*pskb, oldport ^ htons(0xFFFF), newport,
101064                                           hdr->check, 0);
101065 +       }
101066         return 1;
101067  }
101068  
101069 diff -ruNp linux-2.6.19/net/ipv4/netfilter/ip_nat_proto_udp.c linux-2.6.19-xen-3.0.4/net/ipv4/netfilter/ip_nat_proto_udp.c
101070 --- linux-2.6.19/net/ipv4/netfilter/ip_nat_proto_udp.c  2006-11-29 21:57:37.000000000 +0000
101071 +++ linux-2.6.19-xen-3.0.4/net/ipv4/netfilter/ip_nat_proto_udp.c        2007-02-02 19:11:02.000000000 +0000
101072 @@ -115,11 +115,18 @@ udp_manip_pkt(struct sk_buff **pskb,
101073         }
101074  
101075         if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) {
101076 +#ifdef CONFIG_XEN
101077 +               if ((*pskb)->proto_csum_blank)
101078 +                       hdr->check = ~nf_csum_update(~oldip, newip, hdr->check);
101079 +               else
101080 +#endif
101081 +               {
101082                 hdr->check = nf_proto_csum_update(*pskb, ~oldip, newip,
101083                                                   hdr->check, 1);
101084                 hdr->check = nf_proto_csum_update(*pskb,
101085                                                   *portptr ^ htons(0xFFFF), newport,
101086                                                   hdr->check, 0);
101087 +               }
101088                 if (!hdr->check)
101089                         hdr->check = -1;
101090         }
101091 diff -ruNp linux-2.6.19/net/ipv4/xfrm4_output.c linux-2.6.19-xen-3.0.4/net/ipv4/xfrm4_output.c
101092 --- linux-2.6.19/net/ipv4/xfrm4_output.c        2006-11-29 21:57:37.000000000 +0000
101093 +++ linux-2.6.19-xen-3.0.4/net/ipv4/xfrm4_output.c      2007-02-02 19:11:02.000000000 +0000
101094 @@ -18,6 +18,8 @@
101095  #include <net/xfrm.h>
101096  #include <net/icmp.h>
101097  
101098 +extern int skb_checksum_setup(struct sk_buff *skb);
101099 +
101100  static int xfrm4_tunnel_check_size(struct sk_buff *skb)
101101  {
101102         int mtu, ret = 0;
101103 @@ -48,6 +50,10 @@ static int xfrm4_output_one(struct sk_bu
101104         struct xfrm_state *x = dst->xfrm;
101105         int err;
101106         
101107 +       err = skb_checksum_setup(skb);
101108 +       if (err)
101109 +               goto error_nolock;
101110 +
101111         if (skb->ip_summed == CHECKSUM_PARTIAL) {
101112                 err = skb_checksum_help(skb);
101113                 if (err)
101114 diff -ruNp linux-2.6.19/net/ipv6/addrconf.c linux-2.6.19-xen-3.0.4/net/ipv6/addrconf.c
101115 --- linux-2.6.19/net/ipv6/addrconf.c    2007-02-02 20:26:43.000000000 +0000
101116 +++ linux-2.6.19-xen-3.0.4/net/ipv6/addrconf.c  2007-02-02 19:11:02.000000000 +0000
101117 @@ -2545,6 +2545,8 @@ static void addrconf_dad_start(struct in
101118         spin_lock_bh(&ifp->lock);
101119  
101120         if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
101121 +
101122 +           !(dev->flags&IFF_MULTICAST) ||
101123             !(ifp->flags&IFA_F_TENTATIVE) ||
101124             ifp->flags & IFA_F_NODAD) {
101125                 ifp->flags &= ~IFA_F_TENTATIVE;
101126 @@ -2630,6 +2632,7 @@ static void addrconf_dad_completed(struc
101127         if (ifp->idev->cnf.forwarding == 0 &&
101128             ifp->idev->cnf.rtr_solicits > 0 &&
101129             (dev->flags&IFF_LOOPBACK) == 0 &&
101130 +           (dev->flags & IFF_MULTICAST) &&
101131             (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
101132                 struct in6_addr all_routers;
101133  
101134 diff -ruNp linux-2.6.19/scripts/Makefile.xen linux-2.6.19-xen-3.0.4/scripts/Makefile.xen
101135 --- linux-2.6.19/scripts/Makefile.xen   1970-01-01 00:00:00.000000000 +0000
101136 +++ linux-2.6.19-xen-3.0.4/scripts/Makefile.xen 2007-02-02 19:11:04.000000000 +0000
101137 @@ -0,0 +1,14 @@
101138 +
101139 +# cherrypickxen($1 = allobj)
101140 +cherrypickxen = $(foreach var, $(1), \
101141 +               $(shell o=$(var); \
101142 +                       c=$${o%.o}-xen.c; \
101143 +                       s=$${o%.o}-xen.S; \
101144 +                       oxen=$${o%.o}-xen.o; \
101145 +                       [ -f $(srctree)/$(src)/$${c} ] || \
101146 +                          [ -f $(srctree)/$(src)/$${s} ] \
101147 +                               && echo $$oxen \
101148 +                               || echo $(var) ) \
101149 +         )
101150 +# filterxen($1 = allobj, $2 = noobjs)
101151 +filterxen = $(filter-out $(2), $(1))