/* We want our logfiles to show which binpatches are applied.  At the
   same time, we don't really want to make it obvious what those
   patches do, or even that we're applying binpatches at all.  As
   such, we have a couple of FEATURE_ codes, and the logs just show
   which numbered features have been applied. */

#include <ntddk.h>
#include <wdmsec.h>
#include <ntimage.h>
#include "xsapi.h"
#include "hvm.h"
#include "hypercall.h"
#include "xenhdrs/memory.h"
#include "xenhdrs/sched.h"
#include "xenevtchn.h"
#include "scsiboot.h"

//#include "evtchn.h"

#define BINPATCH_TAG 'NIBX'
#define DRIVER_NAME L"XenPatch"

// N.B. Not a valid GUID, it's tenkei2's plus 1.
#include <initguid.h>
DEFINE_GUID (GUID_DEVCLASS_XENPATCH,
        0xc87b189a, 0xb648, 0x4b80, 0xb4, 0xb6, 0x26, 0x9e, 0x73, 0xb2, 0x56, 0x4d);


/* Implemented outside this section of the file.  QuiesceSystem() returns a
   KIRQL which the caller later hands back to UnquiesceSystem() (see
   UnpatchAPIC for the calling pattern). */
KIRQL QuiesceSystem(void);
void UnquiesceSystem(KIRQL);

//
// http://undocumented.ntinternals.net
//
    
#define SystemModuleInformation 11
    
/* One entry of the module list returned by ZwQuerySystemInformation for
   the SystemModuleInformation class.  This file only reads
   ImageBaseAddress, ImageSize, NameOffset and Name.
   NOTE(review): the two leading ULONG fields presumably mirror pointer-sized
   members of the real (undocumented) structure; if so this layout only
   matches on 32-bit builds -- confirm before using on x64. */
typedef struct _SYSTEM_MODULE {

    ULONG Reserved1; 
    ULONG Reserved2; 
    PVOID ImageBaseAddress;     /* load address of the image */
    ULONG ImageSize;            /* size of the mapped image, in bytes */
    ULONG Flags; 
    SHORT Id; 
    SHORT Rank; 
    SHORT w018; 
    SHORT NameOffset;           /* index into Name[] where the file name starts */
    CHAR  Name[256];            /* image path; &Name[NameOffset] is the base name */

} SYSTEM_MODULE, *PSYSTEM_MODULE;

/* Allow zero-size arrays */
#pragma warning (disable:4200)

/* Header of the SystemModuleInformation result: a count followed directly
   by that many SYSTEM_MODULE entries (declared as a zero-length array, hence
   the C4200 suppression around this definition). */
typedef struct _SYSTEM_MODULE_INFORMATION {

    ULONG           ModulesCount;   /* number of entries in Modules[] */
    SYSTEM_MODULE   Modules[0];     /* variable-length trailing array */

} SYSTEM_MODULE_INFORMATION, *PSYSTEM_MODULE_INFORMATION;
#pragma warning (default:4200)

/* Local prototype for the kernel export; it is not declared by the headers
   included above.  Note that although every parameter is annotated IN,
   this file uses SystemInformation as the output buffer and ReturnLength as
   an output (the required length is read back from it in
   GetModuleBaseAddress). */
NTSTATUS 
NTAPI 
ZwQuerySystemInformation(
    IN  ULONG   SystemInformationClass, // SYSTEM_INFORMATION_CLASS 
    IN  PVOID   SystemInformation,
    IN  ULONG   systemInformationLength,
    IN  PULONG  ReturnLength
    );

/* LdrMkVA: add offset y to base x and yield the sum as a PVOID. */
#define LdrMkVA(x,y) (PVOID)((ULONG_PTR)(x) + (ULONG_PTR)(y))

/* Header accessors for an image mapped at address x.  LdrGetFileHeader
   steps e_lfanew bytes from the image base and then over one further ULONG
   (the 4-byte signature that precedes the file header); the optional header
   follows the file header directly. */
#define LdrGetDosHeader(x) ((PIMAGE_DOS_HEADER)(x))
#define LdrGetFileHeader(x) ((PIMAGE_FILE_HEADER)((ULONG_PTR)(x) + (ULONG)(LdrGetDosHeader(x)->e_lfanew) + sizeof(ULONG)))
#define LdrGetOptionalHeader(x) ((PIMAGE_OPTIONAL_HEADER)(((ULONG_PTR)LdrGetFileHeader(x) + sizeof(IMAGE_FILE_HEADER))))

/* Allow typecasts between PVOID and function pointers */
#pragma warning(disable : 4054)

#define FEATURE_APIC_PATCH 4
#define FEATURE_BSOD_TEXT_PATCH 5

////////////////////////////////////////////////////////////////////////////////
//  PATCH DEFINES
//

typedef int (NTAPI * VID_DISPLAY_STRING)(IN PCHAR);

////////////////////////////////////////////////////////////////////////////////
//  PATCH GLOBALS
//

static ULONG        _g_BinPatchPhaseComplete    = (ULONG)-1;
/* Tested as a bit mask by _PatchApic to decide whether its patches are
   already in place. */
static ULONGLONG    _g_BinPatchFeatureInstalled = 0;
/* OS version data consulted by _PatchApic (dwMajorVersion,
   wServicePackMajor); it is filled in outside this section of the file. */
static RTL_OSVERSIONINFOEXW osInfo;
/* Coarse OS classification used by _PatchApic to pick patch offsets. */
static enum {
            osUnknown,
            osW2k,
            osXP,
            osW2k3,
            osVista,
            osW2k8,
            osMAX
       } osType = osUnknown;

static ULONG _g_BinPatchInit    = FALSE;

/* Original VidDisplayString entry point; only referenced by the IAT hook
   code that is currently compiled out with #if 0. */
static VID_DISPLAY_STRING _g_SavedVidDisplayString = NULL;
 
//
//  AA    PPP   IIIII   CCC
// A  A   P  P    I    C
// AAAA   PPP     I    C            Structures and instruction patching
// A  A   P       I    C            definitions.
// A  A   P     IIIII   CCC
//
// Temporary APIC structure definition as the only things we really care about
// are the TPR (at offset 0x80) and the IRR at offset 0x200.  We should include
// a definition from elsewhere in Xen when/if we expand it to use other fields.
//

/* Minimal overlay of the APIC page: padding places TPR at byte offset 0x80
   and the IRR array at byte offset 0x200; nothing else is described. */
typedef struct
{
    UCHAR   fill0[0x80];            /* 0x000 - 0x07f: unused here */
    ULONG   TPR;                    /* 0x080: task priority register */
    UCHAR   fill1[0x200-0x84];      /* pad from end of TPR up to 0x200 */
    ULONG   IRR[8];                 /* 0x200: interrupt request bits */
} APIC_PAGE, *PAPIC_PAGE;

static int _g_ApicAltMovCR8AMD = FALSE;

/* Per-CPU virtual address of the mapped APIC page, or NULL when it is not
   (or no longer) mapped: filled in by MapApicPages, cleared by UnpatchAPIC
   and on remap failure in ApicRecoverFromSuspend. */
static PAPIC_PAGE _g_Apic[MAXIMUM_PROCESSORS];
/* Per-CPU physical address of the page backing _g_Apic[cpu]; reused by
   ApicRecoverFromSuspend to re-add the page to the physmap. */
static PHYSICAL_ADDRESS _g_ApicPhysAddr[MAXIMUM_PROCESSORS];

// TPR_CODE_HANDLE sequences are replaced with calls to routines which
// perform the equivalent functions in an enlightened fashion.
//
// This is an X-macro list: each TPR(name, bytes) entry gives a symbolic name
// and the exact instruction bytes to look for.  The includer defines TPR()
// to turn the list into a table, an enum, or extern declarations.

#define TPR_CODE_HANDLE \
    TPR(store_const_0x3d, "\xc7\x05\x80\x00\xfe\xff\x3d\x00\x00\x00") /* mov dword ptr ds:[0FFFE0080h],03Dh */ \
    TPR(store_const_0xd1, "\xc7\x05\x80\x00\xfe\xff\xd1\x00\x00\x00") /* mov dword ptr ds:[0FFFE0080h],0D1h */ \
    TPR(store_const_0xc1, "\xc7\x05\x80\x00\xfe\xff\xc1\x00\x00\x00") /* mov dword ptr ds:[0FFFE0080h],0C1h */ \
    TPR(store_const_0x41, "\xc7\x05\x80\x00\xfe\xff\x41\x00\x00\x00") /* mov dword ptr ds:[0FFFE0080h],41h */ \
    TPR(store_ecx,        "\x89\x0d\x80\x00\xfe\xff") /* mov [0FFFE0080h],ecx */ \
    TPR(store_esi,        "\x89\x35\x80\x00\xfe\xff") /* mov [0FFFE0080h],esi */ \
    TPR(store_eax,        "\xa3\x80\x00\xfe\xff")     /* mov [0FFFE0080h],eax */ \
    TPR(load_eax,         "\xa1\x80\x00\xfe\xff")     /* mov eax,[0FFFE0080h] */ \
    TPR(load_ecx,         "\x8b\x0d\x80\x00\xfe\xff") /* mov ecx,[0FFFE0080h] */ \
    TPR(load_edx,         "\x8b\x15\x80\x00\xfe\xff") /* mov edx,[0FFFE0080h] */ \
    TPR(load_esi,         "\x8b\x35\x80\x00\xfe\xff") /* mov esi,[0FFFE0080h] */ \

// TPR_CODE_REMOVE sequences are replaced by nops (actually the first two
// bytes are replaced with an unconditional branch over the sequence, and
// the remainder is nop'd).

#define TPR_CODE_REMOVE \
    TPR(test_icr_eax,     "\x85\x05\x00\x03\xfe\xff\x75\xf8") /* test [0FFFE0300h],eax ; jne test */ \
    TPR(test_icr_ebx,     "\x85\x1d\x00\x03\xfe\xff\x75\xf8") /* test [0FFFE0300h],ebx ; jne test */ \
    TPR(test_icr_ecx,     "\x85\x0d\x00\x03\xfe\xff\x75\xf8") /* test [0FFFE0300h],ecx ; jne test */ \
    TPR(test_icr_0x1000,  "\xf7\x05\x00\x03\xfe\xff\x00\x10\x00\x00\x75\xf4") /* test [0FFFE0300h], 0x1000 ; jne test */ \
    TPR(bt_irr_edx_eax,   "\x0f\xa3\x02\x73\xfb")             /* bt [edx],eax ; jnc bt */ \

#define TPR_CODE_LIST TPR_CODE_HANDLE TPR_CODE_REMOVE

//
// Generate a table of the TPR access instructions we handle and the
// length of those instructions (as we will need to no-op out the part
// of the instruction not overwritten by the call into our handlers).
//

typedef struct 
{
    PCHAR   Code;   // exact instruction bytes to match
    ULONG   Len;    // number of bytes in Code (no terminating NUL)
} TPR_CODE, *PTPR_CODE;

// sizeof(y) counts the string literal's implicit NUL, hence the "- 1".
// The table is indexed by TPR_CODE_INDEX and ends with an empty sentinel.
#define TPR(x,y) { y, (ULONG)sizeof(y) - 1 },
static TPR_CODE TprCode[] =
{
    TPR_CODE_LIST
    { "", 0 }
};
#undef TPR

/*
 * Test whether the bytes at addr are exactly the instruction sequence
 * TprCode[index].
 *
 *   addr   - candidate patch site.
 *   start  - inclusive lower bound of the module being scanned.
 *   end    - exclusive upper bound of that module.
 *   index  - TPR_CODE_INDEX value selecting the expected sequence.
 *
 * Returns 1 on a match and 0 otherwise.  A site whose bytes do not lie
 * entirely inside [start, end) is rejected with a warning and never read.
 */
static int
is_tpr_access(const void *addr, ULONG_PTR start, ULONG_PTR end,
              int index)
{
    TPR_CODE *tpr = &TprCode[index];
    ULONG_PTR site = (ULONG_PTR)addr;

    /* The sequence occupies [site, site + Len), which fits as long as
       site + Len <= end.  Comparing Len against the remaining space avoids
       both the off-by-one of a ">=" test on the sum and wrap-around of the
       sum itself. */
    if (site < start || site > end || tpr->Len > end - site) {
        TraceWarning(("TPR patch at %p is out of range[%p,%p)\n",
                      addr, start, end));
        return 0;
    }

    /* RtlCompareMemory returns the number of leading bytes that match. */
    if (RtlCompareMemory(addr, tpr->Code, tpr->Len) != tpr->Len)
        return 0;

    return 1;
}

/* Convenience wrapper around is_tpr_access(): x is the bare sequence name
   from the TPR list.  The expansion refers to variables named halBase and
   halEnd, which must exist in the scope where the macro is used. */
#define IS_TPR_ACCESS(a, x) \
is_tpr_access((a), halBase, halEnd, TPR_##x)

//
// Create an enumeration of the TPR instructions we handle.  The enumerators
// are generated from the same list, in the same order, as TprCode[], so
// TPR_<name> is the index of that sequence in TprCode[].  TPR_CODE_MAX is
// the number of real entries (it indexes the empty sentinel).
//

#define TPR(x,y) TPR_##x,
typedef enum 
{
    TPR_CODE_LIST
    TPR_CODE_MAX
} TPR_CODE_INDEX;
#undef TPR

//
// Table of entry points for the TPR access routines.  These are the routines
// we patch the original TPR access instructions to branch into.
//
// Only the TPR_CODE_HANDLE sequences have handlers.  They come first in
// TPR_CODE_LIST, so for those entries the TPR_CODE_INDEX value is also the
// index into this table.  The tpr_<name> symbols are defined outside this
// file section; only their addresses are taken here.
//

#define TPR(x,y) extern UCHAR tpr_##x;
TPR_CODE_HANDLE
#undef TPR

#define TPR(x,y) &tpr_##x,
static PUCHAR TprHandlerEntry[] =
{
    TPR_CODE_HANDLE
    NULL
};
#undef TPR

// Number of real handler entries (the trailing NULL is not counted).
#define TPR_HANDLE_MAX ((sizeof(TprHandlerEntry)/sizeof(TprHandlerEntry[0]))-1)

//
//  AA    PPP   IIIII   CCC
// A  A   P  P    I    C
// AAAA   PPP     I    C            End.
// A  A   P       I    C
// A  A   P     IIIII   CCC
//


////////////////////////////////////////////////////////////////////////////////
//  PATCH SUPPORT ROUTINES
//

/*
 * Look up an export by name in a loaded image.
 *
 *   ImageBase        - base address of the mapped image (page aligned).
 *   RoutineName      - NUL-terminated export name, compared with strcmp.
 *   pRoutineAddress  - receives the address of the export on success; it is
 *                      left untouched on failure.
 *
 * Returns STATUS_SUCCESS when found, STATUS_INVALID_PARAMETER for a zero
 * base or NULL pointer argument, and STATUS_UNSUCCESSFUL when the image has
 * no usable export table or the name is not present.
 *
 * The export name table is binary searched, so it must be sorted in strcmp
 * order.  Forwarded exports are not resolved: the raw export address table
 * entry is added to ImageBase.
 */
NTSTATUS
GetModuleExportAddress(
    IN  ULONG_PTR       ImageBase,
    IN  PCHAR           RoutineName,
    OUT PVOID*          pRoutineAddress
)
{
    NTSTATUS                ntStatus;
    PIMAGE_EXPORT_DIRECTORY pexports;
    PULONG                  peat;
    PULONG                  pname;
    PUSHORT                 pordinal;
    ULONG                   exportRva;
    ULONG                   i;
    ULONG                   high;
    ULONG                   low;
    LONG                    diff;
    BOOLEAN                 found;

    ASSERT ((ImageBase & (PAGE_SIZE-1)) == 0);

    if( ImageBase == 0 || RoutineName == NULL || pRoutineAddress == NULL )
    {
        ntStatus = STATUS_INVALID_PARAMETER;
        goto _Exit;   
    }

    found = FALSE;
    i     = 0;

    //
    // An image without an export directory has a zero RVA here; treating
    // that as a directory at ImageBase would parse the DOS header as an
    // export table.
    //
    exportRva = LdrGetOptionalHeader(ImageBase)->DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;

    if (exportRva != 0)
    {
        pexports= (PIMAGE_EXPORT_DIRECTORY) LdrMkVA(ImageBase, exportRva);
        peat    = (PULONG)  LdrMkVA(ImageBase, pexports->AddressOfFunctions);
        pname   = (PULONG)  LdrMkVA(ImageBase, pexports->AddressOfNames);
        pordinal= (PUSHORT) LdrMkVA(ImageBase, pexports->AddressOfNameOrdinals);

        //
        // Binary search over the half-open range [low, high).  With zero
        // names the loop body never runs, so no table entry is touched.
        //
        low  = 0;
        high = pexports->NumberOfNames;

        while (low < high)
        {
            i = low + (high - low) / 2;

            diff = strcmp ((const char *)LdrMkVA(ImageBase, pname[i]), RoutineName);

            if (diff == 0)
            {
                found = TRUE;
                break;
            }

            if (diff > 0)
            {
                high = i;
            }
            else
            {
                low = i + 1;
            }
        }

        //
        // The name ordinal indexes the export address table; refuse to read
        // beyond it if the image is malformed.
        //
        if (found && pordinal[i] < pexports->NumberOfFunctions)
        {
            *pRoutineAddress = (PVOID)(ImageBase + peat[pordinal[i]]);
            ntStatus = STATUS_SUCCESS;
            goto _Exit;
        }
    }

    TraceWarning (("failed to find export \"%s\"\n", RoutineName));
    ntStatus = STATUS_UNSUCCESSFUL;

_Exit:
      
    return ntStatus;
}


/*
 * Query the list of loaded kernel modules, log every entry, and optionally
 * look one up by file name.
 *
 *   ImageBaseName - file name to search for (case-insensitive), or NULL to
 *                   only log the module list.
 *   pImageBase    - optional; receives the module's base address.
 *   pImageSize    - optional; receives the module's image size.
 *
 * Returns STATUS_SUCCESS if the module was found, STATUS_NOT_FOUND if it
 * was not (or ImageBaseName is NULL), STATUS_INSUFFICIENT_RESOURCES on
 * allocation failure, or the failure status of ZwQuerySystemInformation.
 *
 * The buffer comes from paged pool.
 */
static NTSTATUS
GetModuleBaseAddress(
    IN  PCHAR           ImageBaseName OPTIONAL,
    OUT PULONG_PTR      pImageBase OPTIONAL,
    OUT PULONG          pImageSize OPTIONAL
)
{
    NTSTATUS                    ntStatus;
    PSYSTEM_MODULE_INFORMATION  pBuffer;
    PSYSTEM_MODULE              pMod;
    ULONG                       i;
    ULONG                       count;
    ULONG                       required;
    ULONG                       bufferLength;
    LONG                        diff;
    
    
    pBuffer  = NULL;

    //
    // First call with no buffer just to learn the required length.
    //
    ntStatus = ZwQuerySystemInformation ( 
                    SystemModuleInformation,
                    NULL,
                    0,
                    &required );
                    
    //
    // The module list can grow between calls, so retry until the buffer
    // is large enough.  Two spare entries are added as slack.
    //
    while (ntStatus == STATUS_INFO_LENGTH_MISMATCH)
    {
        if (pBuffer != NULL) 
        {
            ExFreePool (pBuffer);
        }
        
        bufferLength = required + sizeof(SYSTEM_MODULE) * 2;

        pBuffer = ExAllocatePoolWithTag (PagedPool, 
                                        bufferLength,
                                        BINPATCH_TAG);  
        
        if(pBuffer == NULL)
        {
            TraceError (("GetModuleBaseAddress: Failure to allocate buffer\n"));
            ntStatus = STATUS_INSUFFICIENT_RESOURCES;
            goto _Cleanup;    
        }
                                                                        
        ntStatus = ZwQuerySystemInformation ( 
                        SystemModuleInformation,
                        pBuffer,
                        bufferLength,
                        &required );
    }

    if (! NT_SUCCESS(ntStatus))
    {    
        goto _Cleanup;
    }
    
    //
    // PreFAST doesnt like the goto but we are safe here, make sure.
    //
    
    ASSERT (pBuffer != NULL);
    __assume (pBuffer != NULL);
    
    ntStatus= STATUS_NOT_FOUND;
    count   = pBuffer->ModulesCount;
    pMod    = &pBuffer->Modules[0];

    for (i = 0; i < count; i++) {
        // %p expects a pointer, so the end address is cast back to PVOID.
        TraceInfo(("%s loaded at [%p,%p)\n",
                   &pMod[i].Name[pMod[i].NameOffset],
                   pMod[i].ImageBaseAddress,
                   (PVOID)((ULONG_PTR)pMod[i].ImageBaseAddress + pMod[i].ImageSize)));
    }

    if (ImageBaseName) {
        for (i = 0; i < count; i++)
        {
            diff = _stricmp (&pMod[i].Name[pMod[i].NameOffset], ImageBaseName); 
            
            if (diff == 0)
            {
                ntStatus    = STATUS_SUCCESS;

                // Both outputs are OPTIONAL; only write those supplied.
                if (pImageBase != NULL)
                {
                    *pImageBase = (ULONG_PTR)pMod[i].ImageBaseAddress;
                }
                if (pImageSize != NULL)
                {
                    *pImageSize = pMod[i].ImageSize;
                }
                
                break;
            }
        }
    }

_Cleanup:

    if (pBuffer != NULL)
    {
        ExFreePool (pBuffer);
    }

    return ntStatus;
}

//
// We assume here that the first 2 modules loaded in the kernel were the
// kernel and the hal.  We can't use GetModuleBaseAddress since we don't know
// which variant we are running, i.e. ntoskrnl.exe, ntkrnlpa.exe, halacpi.dll
// etc.  This has proven to be a safe assumption up until at least 2k3 for
// this API.
//

/*
 * Return the base address and size of the kernel image and the HAL, taken
 * to be the first and second entries of the system module list.
 *
 * Every output parameter is optional (may be NULL).
 *
 * Returns STATUS_SUCCESS, STATUS_INSUFFICIENT_RESOURCES if the query buffer
 * cannot be allocated, STATUS_UNSUCCESSFUL if the query did not populate
 * both entries, or the failure status of ZwQuerySystemInformation.
 *
 * The first successful result is cached in function-local statics and
 * returned on all later calls without touching the system (see the XXX
 * note below).  The cache is not protected by a lock.
 */
NTSTATUS
GetSystemBaseAddress(
    OUT PULONG_PTR      KernelBase, OPTIONAL
    OUT PULONG          KernelSize, OPTIONAL
    OUT PULONG_PTR      HalBase,    OPTIONAL
    OUT PULONG          HalSize     OPTIONAL
)
{
    NTSTATUS                    ntStatus;
    PSYSTEM_MODULE_INFORMATION  pBuffer;
    PSYSTEM_MODULE              pMod;
    ULONG                       required;
    ULONG                       bufferLength;

    /* XXX After we restore from hibernate, we have to redo the binary
       patching from a DPC.  We can't call ZwQuerySystemInformation
       from there, and so the workaround is to cache the results the
       first time we're called and just return the same things on
       subsequent invocations, on the assumption that hal.dll and
       ntoskrnl.exe won't move without rebooting. */
    static ULONG_PTR KBaseCache, HBaseCache;
    static ULONG KSizeCache, HSizeCache;
    static BOOLEAN HaveCache;

    if (HaveCache) {
        if (KernelBase)
            *KernelBase = KBaseCache;
        if (KernelSize)
            *KernelSize = KSizeCache;
        if (HalBase)
            *HalBase = HBaseCache;
        if (HalSize)
            *HalSize = HSizeCache;
        return STATUS_SUCCESS;
    }

    //
    // Room for the header plus exactly two module entries.  FIELD_OFFSET
    // is used rather than sizeof(ULONG) so that any alignment padding in
    // front of Modules[] is accounted for.
    //
    bufferLength = FIELD_OFFSET(SYSTEM_MODULE_INFORMATION, Modules)
                    + sizeof(SYSTEM_MODULE) * 2;

    pBuffer = ExAllocatePoolWithTag (PagedPool,
                                     bufferLength,
                                     BINPATCH_TAG);

    if(pBuffer == NULL)
    {
        TraceError (("GetSystemBaseAddress: Failure to allocate buffer\n"));
        ntStatus = STATUS_INSUFFICIENT_RESOURCES;
        goto _Cleanup;    
    }

    //
    // Pool memory is uninitialised.  Zero it so that entries the query
    // did not fill in can be recognised below instead of being cached
    // as garbage.
    //
    RtlZeroMemory (pBuffer, bufferLength);
                                                                    
    ntStatus = ZwQuerySystemInformation ( 
                    SystemModuleInformation,
                    pBuffer,
                    bufferLength,
                    &required );

    //
    // The buffer is deliberately too small for the whole list, so
    // STATUS_INFO_LENGTH_MISMATCH is the expected outcome; we rely on the
    // leading entries having been written anyway.
    //
    if (! NT_SUCCESS(ntStatus) && ntStatus != STATUS_INFO_LENGTH_MISMATCH)
    {    
        TraceError (("GetSystemBaseAddress: Failure w/ ZwQuerySystemInformation"
                     " ntStatus = 0x%08x\n", ntStatus));
        goto _Cleanup;
    }

    pMod    = &pBuffer->Modules[0];

    if (pMod[0].ImageBaseAddress == NULL || pMod[1].ImageBaseAddress == NULL)
    {
        TraceError (("GetSystemBaseAddress: module list was not populated\n"));
        ntStatus = STATUS_UNSUCCESSFUL;
        goto _Cleanup;
    }

    KBaseCache = (ULONG_PTR)pMod[0].ImageBaseAddress;
    KSizeCache = pMod[0].ImageSize;
    HBaseCache = (ULONG_PTR)pMod[1].ImageBaseAddress;
    HSizeCache = pMod[1].ImageSize;
    HaveCache  = TRUE;
    if (KernelBase != NULL) *KernelBase = KBaseCache;
    if (KernelSize != NULL) *KernelSize = KSizeCache;
    if (HalBase != NULL)    *HalBase = HBaseCache;
    if (HalSize != NULL)    *HalSize = HSizeCache;
    ntStatus    = STATUS_SUCCESS;

_Cleanup:

    if (pBuffer != NULL)
    {
        ExFreePool (pBuffer);
    }

    return ntStatus;
}


#if 0
/* Replacement for VidDisplayString (currently compiled out): logs the text
   via TraceNotice, then forwards it to the saved original routine and
   returns that routine's result. */
int
_VidDisplayStringHook(
    IN  PCHAR   AsciiString
)
{
    TraceNotice(("%s", AsciiString));
    
    return _g_SavedVidDisplayString (AsciiString);    
}


/*
 * (Currently compiled out.)  Redirect one import of a loaded image to a
 * hook routine by overwriting its import address table (IAT) slot.
 *
 *   ImportImageBase  - image whose IAT is to be modified.
 *   ExportImageBase  - image that exports RoutineName.
 *   RoutineName      - name of the export to hook.
 *   HookAddress      - address to store in the IAT slot.
 *   pOriginalAddress - receives the previous contents of the slot on
 *                      success; set to NULL if no matching slot exists.
 *
 * Returns STATUS_SUCCESS when the slot was rewritten,
 * STATUS_INVALID_PARAMETER for a zero/NULL argument, the status of
 * GetModuleExportAddress if the export lookup fails, and otherwise the
 * status left over from that lookup (which is a success code) -- callers
 * should therefore also check *pOriginalAddress.
 */
static NTSTATUS
_HookModuleIATEntry(
    IN  ULONG_PTR       ImportImageBase,
    IN  ULONG_PTR       ExportImageBase,
    IN  PCHAR           RoutineName,
    IN  PVOID           HookAddress,
    OUT PVOID*          pOriginalAddress
)
{
    NTSTATUS                    ntStatus;
    PIMAGE_IMPORT_DESCRIPTOR    pimports;
    PULONG_PTR                  pIAT;
    
    BOOLEAN                     bFound;
    
    PMDL                        pMdl;
    PULONG_PTR                  rwptr;
        
    ASSERT ((ImportImageBase & PAGE_SIZE-1) == 0);
    ASSERT ((ExportImageBase & PAGE_SIZE-1) == 0);
    
    ntStatus = STATUS_UNSUCCESSFUL;
    bFound = FALSE;
    pIAT = NULL;
    
    if( ImportImageBase == 0 
     || ExportImageBase == 0
     || RoutineName == NULL 
     || HookAddress == NULL
     || pOriginalAddress == NULL )
    {
        ntStatus = STATUS_INVALID_PARAMETER;
        goto _Exit;   
    }

    // Resolve the export first; the IAT is then searched for this value.
    ntStatus = GetModuleExportAddress (ExportImageBase, 
                                        RoutineName, 
                                        pOriginalAddress);
    
    if (! NT_SUCCESS(ntStatus))
    {
        TraceError (("Could not find address of %p!%s\n", 
                        ExportImageBase, RoutineName));
        goto _Exit;   
    }

    pimports= (PIMAGE_IMPORT_DESCRIPTOR)(LdrGetOptionalHeader(ImportImageBase)->DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress);
    pimports= (PIMAGE_IMPORT_DESCRIPTOR) LdrMkVA(ImportImageBase, pimports);
        
    // Walk every import descriptor (the list ends at a zero Characteristics
    // field) and every slot of its zero-terminated thunk array.
    while (pimports->Characteristics != 0)
    {
        pIAT = LdrMkVA (ImportImageBase, pimports->FirstThunk);
    
        while (*pIAT != 0)
        {
            if (*pIAT  == (ULONG_PTR)*pOriginalAddress)
            {
                bFound = TRUE;
                goto _Done;   
            }
            
            pIAT++;
        }
        
        pimports++;
    }

_Done:
        
    if (bFound == TRUE)
    {
        //
        // Little trick to get a r/w page mapping to what we want to overwrite
        //
        
        pMdl = IoAllocateMdl( pIAT, sizeof(ULONG_PTR), FALSE, FALSE, NULL );
    
        if( pMdl != NULL )
        {
            MmBuildMdlForNonPagedPool( pMdl );
            
            rwptr = MmMapLockedPagesSpecifyCache (
                                                       pMdl,
                                                       KernelMode,
                                                       MmNonCached,
                                                       NULL,
                                                       FALSE,
                                                       NormalPagePriority);
            
            if( rwptr != NULL )
            {
                ASSERT(pIAT != NULL);
                __assume(pIAT != NULL);
                // Save the old slot value, then install the hook through the
                // second mapping.
                *(PULONG_PTR)pOriginalAddress = *rwptr;
                *(PVOID*)rwptr = HookAddress;
                ntStatus = STATUS_SUCCESS;
                
                MmUnmapLockedPages( rwptr, pMdl );
            }
            
            IoFreeMdl( pMdl );
        }
    }
    else
    {
        *pOriginalAddress = NULL;   
    }
    
_Exit:
      
    return ntStatus;
}


////////////////////////////////////////////////////////////////////////////////
//  PATCH ROUTINES
//


/* (Currently compiled out.)  Hook the kernel's import of
   bootvid.dll!VidDisplayString so that the text is passed through
   _VidDisplayStringHook.  Returns the status of the first step that fails,
   or the status of _HookModuleIATEntry. */
static                                      
NTSTATUS
_PatchBluescreenTextOutput()
{
    NTSTATUS    ntStatus;
    ULONG_PTR   ntosBase;
    ULONG_PTR   bootvidBase;
    
    // The kernel image is the importer whose IAT gets patched.
    ntStatus = GetSystemBaseAddress (&ntosBase, NULL, NULL, NULL);
    
    if (! NT_SUCCESS (ntStatus))
    {
        TraceError (("GetSystemBaseAddress failed ntStatus =0x%08x", ntStatus));   
        goto _Cleanup;
    }

    // bootvid.dll is the exporter of VidDisplayString.
    ntStatus = GetModuleBaseAddress ("bootvid.dll", &bootvidBase, NULL);
    
    if (! NT_SUCCESS (ntStatus))
    {
        TraceError (("GetModuleBaseAddress(bootvid.dll) failed ntStatus =0x%08x", 
                        ntStatus));   
        goto _Cleanup;
    }
    
    ntStatus = _HookModuleIATEntry (ntosBase, 
                                    bootvidBase, 
                                    "VidDisplayString", 
                                    (PVOID)_VidDisplayStringHook,
                                    (PVOID*)&_g_SavedVidDisplayString);
                                    
    if (! NT_SUCCESS (ntStatus))
    {
        TraceError (("_HookModuleIATEntry(VidDisplayString) failed ntStatus =0x%08x", ntStatus));   
        goto _Cleanup;
    }
   
_Cleanup:

    return ntStatus;
}
#endif

/*
 * Allocate a single page of system memory, map it into kernel address
 * space and report its physical address.
 *
 *   Bytes - must be exactly PAGE_SIZE; any other value fails.
 *   PA    - receives the physical address of the page on success.
 *
 * Returns the kernel virtual address of the mapping, or NULL on failure.
 *
 * TEMP / KLUDGE (PLJTMP): in this subset build the only caller wants one
 * page which is subsequently added to the physmap.  The MDL describing the
 * page is released before returning while the mapping is kept, so this
 * routine offers no way to unmap or free the page afterwards.
 */
PVOID
XenevtchnAllocIoMemory(
    ULONG Bytes,
    PHYSICAL_ADDRESS *PA
    )
{
    PHYSICAL_ADDRESS    lowest;
    PHYSICAL_ADDRESS    highest;
    PHYSICAL_ADDRESS    skipBytes;
    PMDL                pageMdl;
    PVOID               mapping;

    if (Bytes != PAGE_SIZE) {
        DbgPrint("XenevtchnAllocIoMemory called with byte count != PAGE_SIZE.\n");
        return NULL;
    }

    /* No restriction on where in physical memory the page comes from. */
    lowest.QuadPart = 0;
    highest.QuadPart = ~0;
    skipBytes.QuadPart = 0;

    pageMdl = MmAllocatePagesForMdl(lowest, highest, skipBytes, Bytes);
    if (pageMdl == NULL) {
        DbgPrint("XenevtchnAllocIoMemory: failed to allocate Mdl\n");
        return NULL;
    }

    /* The allocator may hand back less than was asked for. */
    if (pageMdl->ByteCount != Bytes) {
        DbgPrint("XenevtchnAllocIoMemory: allocate Mdl allocated < 1 page?!?!\n");
        goto release_pages;
    }

    mapping = MmMapLockedPagesSpecifyCache(pageMdl, KernelMode, MmCached,
                                           NULL, FALSE, LowPagePriority);
    if (mapping == NULL) {
        DbgPrint("XenevtchnAllocIoMemory: unable to map allocated page.\n");
        goto release_pages;
    }

    /* First (and only) PFN in the MDL, converted to a byte address. */
    PA->QuadPart = (ULONGLONG)*MmGetMdlPfnArray(pageMdl) << PAGE_SHIFT;
    ExFreePool(pageMdl);
    return mapping;

release_pages:
    MmFreePagesFromMdl(pageMdl);
    ExFreePool(pageMdl);
    return NULL;
}


/*
 * Check that the Len bytes at Base + Off equal the expected bytes at Pre.
 *
 *   Base, End - bounds of the module being examined; End is exclusive.
 *   Off       - offset of the candidate site from Base.
 *   Pre       - expected original bytes.
 *   Len       - number of bytes to compare.
 *
 * Returns 1 when the site lies wholly inside [Base, End) and its bytes
 * match, 0 otherwise.  No memory is read for an out-of-range site.
 */
static int
CheckPatchSite(
    IN ULONG_PTR Base,
    IN ULONG_PTR End,
    IN ULONG Off,
    IN const PVOID Pre,
    IN ULONG Len
)
{
    ULONG_PTR ptr = Base + Off;

    /* ptr < Base catches wrap-around of Base + Off. */
    if ( ptr < Base || ptr >= End )
        return 0;

    /* The whole [ptr, ptr + Len) span must fit, not just its first byte;
       compare against the remaining space so the sum cannot overflow. */
    if ( Len > End - ptr )
        return 0;

    if ( memcmp((PVOID)ptr, Pre, Len) )
        return 0;
    else
        return 1;
}

/*
 * Allocate a page and ask the hypervisor to back it with the local APIC
 * page of the given virtual CPU.
 *
 *   cpu      - virtual CPU number whose APIC page is wanted.
 *   physAddr - receives the physical address of the allocated page.
 *
 * Returns the kernel virtual address of the page, or NULL on failure.
 *
 * The XENMAPSPACE_rw_local_apic map space is tried first, with
 * XENMAPSPACE_rw_local_apic_compat as a fallback.  If both fail the page
 * obtained from XenevtchnAllocIoMemory is not released: that routine
 * discards the MDL, so there is nothing to free it with here.
 */
static void *
MapAbstractLocalApic(unsigned long cpu, PHYSICAL_ADDRESS *physAddr)
{
    int ret;
    PVOID res;

    res = XenevtchnAllocIoMemory(PAGE_SIZE, physAddr);
    if (!res) {
        TraceError (("Failed to allocate IO memory for lapic %d.\n", cpu));
        return NULL;
    }
    ret = AddPageToPhysmap((unsigned long)(physAddr->QuadPart >> PAGE_SHIFT),
                           XENMAPSPACE_rw_local_apic,
                           cpu);
    if (ret != 0)
    {
        ret = AddPageToPhysmap((unsigned long)(physAddr->QuadPart >> PAGE_SHIFT),
                               XENMAPSPACE_rw_local_apic_compat,
                               cpu);
        if (ret != 0) {
            TraceNotice (("Failed to add vlapic page for vcpu %d to physmap: %d.\n",
                          cpu, ret));
            return NULL;
        }
        TraceWarning(("Using old-style vlapic mapping.\n"));
    }

    /* Each %x consumes one 32-bit argument, so the two halves of the
       64-bit physical address are passed separately (high:low) instead of
       a single QuadPart. */
    TraceInfo (("APIC %d mapped at %p (%x:%x)\n",
                cpu, res, physAddr->HighPart, physAddr->LowPart));
    return res;
}


/* Record of the patch sites accepted so far: the CHECK_PATCH macro in
   _PatchApic appends an entry for every site that matches, up to
   MAX_APIC_PATCH entries. */
#define MAX_APIC_PATCH 80
static struct
{
    PCHAR       Base;   /* address of the matched instruction sequence */
    LONG        Type;   /* TPR_CODE_INDEX of the sequence (index into TprCode) */
} apic_patches[MAX_APIC_PATCH];
/* Number of valid entries at the front of apic_patches[]. */
static LONG
nr_apic_patches_applied;

/* Create a second, writable mapping that spans every site recorded in
   apic_patches[].

   On success the return value is the base of the new mapping, *size is its
   length in bytes, and *remapAdjust is the value to add to an original
   virtual address to obtain the matching address inside the new mapping.
   Release the mapping with MmUnmapIoSpace(return_value, *size).  On failure
   NULL is returned and neither output is written.

   Only the physical address of the lowest site is looked up, so the sites
   are assumed to live in physically contiguous memory.  Entry 0 of
   apic_patches[] is read unconditionally. */
static PVOID
MapApicPatchSites(PULONG size, PULONG remapAdjust)
{
    PHYSICAL_ADDRESS spanPhys;
    PCHAR spanStart;
    PCHAR spanEnd;
    PCHAR siteEnd;
    PCHAR mapped;
    int idx;

    /* Seed the span with the first recorded site... */
    spanStart = apic_patches[0].Base;
    spanEnd   = spanStart + TprCode[apic_patches[0].Type].Len;

    /* ...then widen it over the remaining ones.  A site that lowers the
       start is not considered for the end. */
    for (idx = 1; idx < nr_apic_patches_applied; idx++)
    {
        if (apic_patches[idx].Base < spanStart)
        {
            spanStart = apic_patches[idx].Base;
            continue;
        }

        siteEnd = apic_patches[idx].Base + TprCode[apic_patches[idx].Type].Len;
        if (siteEnd > spanEnd)
            spanEnd = siteEnd;
    }

    spanPhys = MmGetPhysicalAddress(spanStart);
    mapped = MmMapIoSpace(spanPhys, spanEnd - spanStart, MmCached);
    if (mapped != NULL)
    {
        *remapAdjust = mapped - spanStart;
        *size = spanEnd - spanStart;
    }

    return mapped;
}

/* This doesn't actually revert the patches any more, since that's too
   hard (e.g. if some thread is currently suspended in a patch
   replacement).  Instead, we set the apic page pointers to NULL,
   which causes the patch handlers to go to a slow path which is safe
   during hibernation, which is the only place this gets called
   from. */
void
UnpatchAPIC(VOID)
{
    KIRQL savedIrql;
    ULONG nrCpus;
    ULONG cpu;

    nrCpus = KeNumberProcessors;

    /* Drop every per-CPU APIC page pointer while the system is quiesced;
       the instruction patches themselves stay in place. */
    savedIrql = QuiesceSystem();
    cpu = 0;
    while (cpu < nrCpus)
    {
        _g_Apic[cpu] = NULL;
        cpu++;
    }
    UnquiesceSystem(savedIrql);

    TraceNotice (("Suppressed feature %d all variants\n", FEATURE_APIC_PATCH));
}

static int
MapApicPages(VOID)
{
    LONG cpu;
    int res = 1;

    for (cpu = 0; cpu < KeNumberProcessors; cpu++)
    {
        if (_g_Apic[cpu] == NULL)
            _g_Apic[cpu] = MapAbstractLocalApic(cpu, &_g_ApicPhysAddr[cpu]);
        if ( _g_Apic[cpu] == NULL ) {
            /* Try to continue even if some pages fail.  We may be
               resuming from hibernate, in which case some of the
               patches are already in and will be slow path until the
               pages get mapped.  Don't try to apply any more patches,
               though. */
            TraceNotice (("Failed to map PV APIC page %d.\n", cpu));
            res = 0;
        }
    }
    return res;
}

/* This is invoked directly from suspend.c::DoSuspend, rather than via
   the usual suspend recovery handler mechanism, because it's so
   special. */
void
ApicRecoverFromSuspend(void)
{
    int lastFailed = -1;
    unsigned long pfn;
    LONG idx;

    /* Re-add each page that was mapped before the suspend to the physmap,
       trying the current map space first and the compat one second.  A
       slot whose page cannot be re-added is cleared. */
    for (idx = 0; idx < KeNumberProcessors; idx++) {
        if (!_g_Apic[idx])
            continue;

        pfn = (unsigned long)(_g_ApicPhysAddr[idx].QuadPart >> PAGE_SHIFT);

        if (AddPageToPhysmap(pfn, XENMAPSPACE_rw_local_apic, idx) == 0)
            continue;
        if (AddPageToPhysmap(pfn, XENMAPSPACE_rw_local_apic_compat, idx) == 0)
            continue;

        _g_Apic[idx] = NULL;
        lastFailed = idx;
    }

    /* Only the highest-numbered failing CPU is reported. */
    if (lastFailed != -1)
        TraceError(("Failed to remap lapic page %d?\n", lastFailed));

    for (idx = 0; idx < KeNumberProcessors; idx++) {
        TraceInfo(("Remapped apic %d to %p.\n", idx, _g_Apic[idx]));
    }
}

static int
_PatchApic()
{
    NTSTATUS        status;
    ULONG_PTR       halBase;
    ULONG_PTR       halEnd;
    ULONG           halSize;
    PCHAR           halKeGetCurrentIrql;
    LONG            tprHal = FALSE;
    KIRQL           oldIrql;
    PCHAR           address;
    PCHAR           newBase;
    ULONG           remapAdjust;
    ULONG           tmpOffset;
    LONG            i;
    BOOLEAN         mp;
    ULONG           size;
    KSPIN_LOCK      lockTest;

    /* Patches already applied? */
    if (_g_BinPatchFeatureInstalled & DEBUG_PATCH_APIC)
    {
        return 0;
    }

    //
    // This patch differs slightly depending on OS version.
    // Currently we only patch Windows 2000, XP SP2, SP3 and Windows
    // 2003 SP1.
    //
    // Note: Major version for Vista is 6 but we don't have patches for
    // it yet.
    //
    // Note: 2k3 SP2 implements lazy irql in the HAL so we don't need 
    // most of it, however, we can probably help it by removing the 
    // polling of the apic busy bit in the ipi send routines.
    //

    if (osInfo.dwMajorVersion != 5)
    {
        // Windows 2000, XP and 2003 have a major version of 5.
        return 0;
    }

    switch (osType)
    {
    case osW2k:
        break;

    case osXP:
        if (osInfo.wServicePackMajor < 2 ||
            osInfo.wServicePackMajor > 3)   // Service Pack 2, or 3 ok
        {
            return 0;
        }
        break;

    case osW2k3:
        if (osInfo.wServicePackMajor > 2)   // Service pack 0, 1 and 2 ok
        {
            return 0;
        }
        break;

    default:
        return 0;                           // don't know what it is.
    }

    status = GetSystemBaseAddress(NULL, NULL, &halBase, &halSize);
    if (!NT_SUCCESS(status))
    {
        TraceError(("Could not find base of hal.dll\n"));
        return 0;
    }
    halEnd = halBase + halSize;

    //
    // Determine if the kernel/hal are UP or MP by initializing then
    // acquiring a spin lock and seeing if the value of the lock is
    // changed by acquisition.  With UP hals, only the IRQL changes,
    // not the lock.
    //

    KeInitializeSpinLock(&lockTest);
    oldIrql = KfAcquireSpinLock(&lockTest);

    mp = (BOOLEAN)(lockTest != 0);

    KfReleaseSpinLock(&lockTest, oldIrql);

    //
    // APIC Interprocessor Interrupts
    //

    // IPI send routines test APIC.ICR to make sure there isn't already a send
    // pending before proceeding.  With Xen we are guaranteed this bit will
    // always be zero so we avoid the vmexit by not checking it.
    //
    // The lazy irql hal (2k3 sp2) doesn't use the APIC to send interrupts
    // to itself, so we don't attempt to patch what isn't there.
    status = GetModuleExportAddress(halBase,
                                    "HalRequestSoftwareInterrupt",
                                    &address);

    if (!NT_SUCCESS(status))
    {
        TraceError(("Could not find address of hal!HalRequestSoftwareInterrupt\n"));
        return 0;
    }

#define CHECK_PATCH(x,t, y)                                         \
    if (nr_apic_patches_applied < MAX_APIC_PATCH)                   \
    {                                                               \
        if (IS_TPR_ACCESS((x), t))                                  \
        {                                                           \
            apic_patches[nr_apic_patches_applied].Base = (x);       \
            apic_patches[nr_apic_patches_applied].Type = TPR_##t;   \
            nr_apic_patches_applied++;                              \
            TraceNotice (("Feature %d variant %d at %p.\n", FEATURE_APIC_PATCH, (y), (x))); \
        }                                                           \
        else                                                        \
        {                                                           \
            DbgPrint("TPR mismatch line %d\n", __LINE__);           \
        }                                                           \
    }                                                               \
    else                                                            \
    {                                                               \
        DbgPrint("APIC: Too many patches, Citrix please increase buffer size. (line %d)\n", __LINE__); \
    }

    tmpOffset = 0;
    if (mp)
    {
        tmpOffset = 1;
    }

    if (osType == osW2k3)
    {
        if (osInfo.wServicePackMajor < 2)
        {
            CHECK_PATCH(address + tmpOffset + 0x1e, test_icr_0x1000, 400);
            CHECK_PATCH(address + tmpOffset + 0x41, bt_irr_edx_eax, 401);
        }
    }
    else
    {
        // XP or 2K

        CHECK_PATCH(address + tmpOffset + 0x1c, test_icr_0x1000, 402);
        CHECK_PATCH(address + tmpOffset + 0x2e, test_icr_0x1000, 403);
    }

    if (mp)
    {
        // In the MP case we will also patch tests in HalpSendIpi and
        // HalpSendFlatIpi.  Neither routine is exported so we derive
        // the address as an offset from HalRequestIpi which is (and
        // is the closest export).
        //
        // N.B. When the guest supports more than 8 processors we will
        // need to find the non-flat apic ipi routine.

        status = GetModuleExportAddress(halBase,
                                        "HalRequestIpi",
                                        &address);

        if (NT_SUCCESS(status))
        {
            if (osType == osW2k3) // offsets are the same thru sp2
            {
                address -= 0x100;  // HalpSendFlatIpi

                CHECK_PATCH(address + 0x10, test_icr_ecx, 404);

                address += 0x30;    // HalpSendIpi

                CHECK_PATCH(address + 0x17, test_icr_eax, 405);

                address += 0x40;    // HalpRequestIpiSpecifyVector

                CHECK_PATCH(address + 0x30, test_icr_ebx, 406);
                CHECK_PATCH(address + 0x71, test_icr_ebx, 407);
            }
            else
            {
                // XP and 2K are a bit different in that HalRequestIpi does
                // the work inline where 2k3 computes the destination type
                // (flat mode vs cluster) up front and calls thru a pointer.

                CHECK_PATCH(address + 0x12, test_icr_0x1000, 408);
                CHECK_PATCH(address + 0x2d, test_icr_0x1000, 409);
                CHECK_PATCH(address + 0x99, test_icr_0x1000, 410);
                CHECK_PATCH(address + 0xb5, test_icr_0x1000, 411);
            }
        }
    }

    //
    // All the APIC hals, except the lazy IRQL ones have a version of
    // KeGetCurrentIrql beginning with a read from the TPR.  If this hal
    // doesn't, get out.
    //

    status = GetModuleExportAddress(halBase, 
                                    "KeGetCurrentIrql",
                                    &halKeGetCurrentIrql);

    if (NT_SUCCESS(status))
    {
        //
        // Check if the first instruction of KeGetCurrentIrql is a move from
        // the TPR into eax (the return register).   We do this in a try block
        // incase the address we got from GetModuleExportAddress is bogus
        // (unlikely).
        //
        // hal!KeGetCurrentIrql:
        // a18000feff      mov     eax,dword ptr ds:[FFFE0080h]
        // c1e804          shr     eax,4
        // 0fb6801092a780  movzx   eax,byte ptr hal!HalpVectorToIRQL (80a79210)[eax]
        // c3              ret
        //
        // Note: This is the same in XP UP/MP and 2k3 UP/MP. 2k3 SP2 will fail
        // this test because it does lazy IRQL.
        //

        try
        {
            tprHal = IS_TPR_ACCESS(halKeGetCurrentIrql, load_eax);
        }
        except(1)
        {
            //
            // Catch anything, do nothing.
            //
        }
    }

    //
    // In the Windows 2003 Server SP1 MP ACPI HAL there are 40 locations
    // that access the TPR, these break down into 11 different instructions-
    //
    // code                    mnemonic                            locations   frequency
    // c7058000feffc1000000    mov dword ptr ds:[0FFFE0080h],0C1h  3           many
    // c7058000feff41000000    mov dword ptr ds:[0FFFE0080h],41h   5           very many
    // 890d8000feff            mov dword ptr ds:[0FFFE0080h],ecx   4           very many
    // 89358000feff            mov dword ptr ds:[0FFFE0080h],esi   1           moderate
    // a38000feff              mov dword ptr ds:[FFFE0080h],eax    4           many
    // a18000feff              mov eax,dword ptr ds:[FFFE0080h]    10          very many
    // 8b0d8000feff            mov ecx,dword ptr ds:[0FFFE0080h]   4           many
    // 8b158000feff            mov edx,dword ptr ds:[0FFFE0080h]   5           many
    // 8b358000feff            mov esi,dword ptr ds:[0FFFE0080h]   1           moderate
    // 
    // Locate each patch that will be applied.
    //
        
    if (tprHal)
    {
        // First patch is hal!KeGetCurrentIrql, not because it's the most
        // frequent but because we already looked it up when checking to
        // see if this HAL should be patched (above).
        //

        apic_patches[nr_apic_patches_applied].Base = halKeGetCurrentIrql;
        apic_patches[nr_apic_patches_applied].Type = TPR_load_eax;
        nr_apic_patches_applied++;

        // Next two patches are in KeReleaseQueuedSpinLock, the write of the new IRQL
        // and the read back to ensure the write completed and any eligible interrupts
        // are taken before KeReleaseQueuedSpinLock returns. 
        status = GetModuleExportAddress(halBase,
                                        "KeReleaseQueuedSpinLock",
                                        &address);
        if (NT_SUCCESS(status))
        {
            if (!mp)
            {
                // Same offsets for UP in XP and W2k3.

                tmpOffset = 0x0a;
            }
            else
            {
                if ((osType == osXP)
                 || (osType == osW2k))
                {
                    tmpOffset = 0x31;
                }
                else if (osType == osW2k3)
                {
                    tmpOffset = 0x30;
                }
            }

            if (!IS_TPR_ACCESS(address + tmpOffset, store_ecx))
            {
                //
                // Give up, we don't know what code we're looking at.
                //

                return 0;
            }
            CHECK_PATCH(address + tmpOffset, store_ecx, 1);
            CHECK_PATCH(address + tmpOffset + 6, load_eax, 2);
        }
        else
        {
            //
            // Didn't find KeReleaseQueuedSpinLock?   Really not much point in
            // proceeding. By that I mean, if we can't find this, there's little
            // hope we'll find any of the others, something is seriously wrong.
            //

            return 0;
        }

        // Next two are in KeAcquireInStackQueuedSpinLock, first two instructions 
        // in w2k3, first in XP and then + 0xe.
        if (osType != osW2k)
        {
            status = GetModuleExportAddress(halBase,
                                            "KeAcquireInStackQueuedSpinLock",
                                            &address);
            if (NT_SUCCESS(status))
            {
                CHECK_PATCH(address + 0, load_eax, 3);          // MP/UP same.

                if (osType == osW2k3)
                {
                    tmpOffset = 5;
                } else {
                    tmpOffset = 0xe;
                }
                CHECK_PATCH(address + tmpOffset, store_const_0x41, 4);
            }
        }
        // KfLowerIrql.
        status = GetModuleExportAddress(halBase,
                                        "KfLowerIrql",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0x0c, store_ecx, 5);      // MP/UP 2k3/XP/2k same.
            CHECK_PATCH(address + 0x12, load_eax, 6);
        }

        // KeAcquireQueuedSpinLock
        status = GetModuleExportAddress(halBase,
                                        "KeAcquireQueuedSpinLock",
                                        &address);
        if (NT_SUCCESS(status))
        {
            if (osType == osXP)
            {
                tmpOffset = 0xf;
            } else {
                // 2k and 2k3 same.
                tmpOffset = 5;
            }
            CHECK_PATCH(address + 0, load_eax, 7);          // MP/UP same.
            CHECK_PATCH(address + tmpOffset, store_const_0x41, 8);
        }

        // KeRaiseIrqlToDpcLevel
        status = GetModuleExportAddress(halBase,
                                        "KeRaiseIrqlToDpcLevel",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0, load_edx, 9);          // MP/UP 2k3/XP/2k same.
            CHECK_PATCH(address + 6, store_const_0x41, 10);
        }

        // KfRaiseIrql
        status = GetModuleExportAddress(halBase,
                                        "KfRaiseIrql",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0xa, load_eax, 11);       // MP/UP 2k3/XP/2k same.
            CHECK_PATCH(address + 0xf, store_ecx, 12);
        }

        // KeAcquireQueuedSpinLockRaiseToSynch
        status = GetModuleExportAddress(halBase,
                                        "KeAcquireQueuedSpinLockRaiseToSynch",
                                        &address);
        if (NT_SUCCESS(status))
        {
            if (osType == osW2k3)
            {
                CHECK_PATCH(address + 0, load_eax, 13);

                if (mp)
                {
                    CHECK_PATCH(address + 5, store_const_0xc1, 14);
                }
                else
                {
                    // UP systems post 2K (ie XP and above), SYNCH level is
                    // mapped to DISPATCH level.
                    CHECK_PATCH(address + 5, store_const_0x41, 114);
                }
            }
            else
            {
                // XP/2K
                if (mp)
                {
                    CHECK_PATCH(address + 5, store_const_0xd1, 214); 
                }
                else
                {
                    CHECK_PATCH(address + 0, load_eax, 213);
                    if (osType == osW2k)
                    {
                        CHECK_PATCH(address + 5, store_const_0xd1, 314); 
                    }
                    else
                    {
                        CHECK_PATCH(address + 0xf, store_const_0x41, 215);
                    }
                }
            }
        }

        // KeRaiseIrqlToSynchLevel
        status = GetModuleExportAddress(halBase,
                                        "KeRaiseIrqlToSynchLevel",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0, load_edx, 15);

            if (mp
             || (osType == osW2k))
            {
                if (osType == osW2k3)
                {
                    CHECK_PATCH(address + 6, store_const_0xc1, 16);
                }
                else
                {
                    // XP/2K
                    CHECK_PATCH(address + 6, store_const_0xd1, 216);
                }
            }
            else
            {
                // UP systems post 2K (ie XP and above), SYNCH level is
                // mapped to DISPATCH level.
                CHECK_PATCH(address + 6, store_const_0x41, 116);
            }
        }

        // HalBeginSystemInterrupt
        status = GetModuleExportAddress(halBase,
                                        "HalBeginSystemInterrupt",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0x0c, load_ecx, 17);      // MP/UP 2k3/XP/2k same.
            CHECK_PATCH(address + 0x12, store_eax, 18);
        }

        // HalEndSystemInterrupt
        status = GetModuleExportAddress(halBase,
                                        "HalEndSystemInterrupt",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0x1b, store_ecx, 19);     // MP/UP 2k3/XP/2k same.
            CHECK_PATCH(address + 0x21, load_edx, 20);
            if (mp)
            {
                CHECK_PATCH(address + 0x3c, store_const_0x41, 21);
            }
            else
            {
                CHECK_PATCH(address + 0x3a, store_const_0x41, 121);
            }
        }

        // KfReleaseSpinLock.
        status = GetModuleExportAddress(halBase,
                                        "KfReleaseSpinLock",
                                        &address);
        if (NT_SUCCESS(status))
        {
            if (osType == osW2k3)
            {
                if (mp)
                {
                    tmpOffset = 0xe;
                }
                else
                {
                    tmpOffset = 0xa;
                }
            }
            else
            {
                // XP/2K

                if (mp)
                {
                    tmpOffset = 0xf;
                }
                else
                {
                    tmpOffset = 0xc;
                }
            }
            CHECK_PATCH(address + tmpOffset, store_ecx, 22);
            CHECK_PATCH(address + tmpOffset + 6, load_eax, 23);
        }

        // KfAcquireSpinLock
        status = GetModuleExportAddress(halBase,
                                        "KfAcquireSpinLock",
                                        &address);
        if (NT_SUCCESS(status))
        {
            CHECK_PATCH(address + 0, load_edx, 24);         // MP/UP 2k3/XP/2k same.
            CHECK_PATCH(address + 6, store_const_0x41, 25);
        }

        // KeAcquireInStackQueuedSpinLockRaiseToSynch
        if (osType != osW2k)
        {
            status = GetModuleExportAddress(halBase,
                                            "KeAcquireInStackQueuedSpinLockRaiseToSynch",
                                            &address);
            if (NT_SUCCESS(status))
            {
                if (osType == osW2k3)
                {
                    CHECK_PATCH(address + 0, load_eax, 26);

                    if (mp)
                    {
                        CHECK_PATCH(address + 5, store_const_0xc1, 27);
                    }
                    else
                    {
                        CHECK_PATCH(address + 5, store_const_0x41, 127);
                    }
                }
                else
                {
                    // XP

                    if (mp)
                    {
                        CHECK_PATCH(address + 5, store_const_0xd1, 228);
                    }
                    else
                    {
                        CHECK_PATCH(address + 0, load_eax, 226);
                        CHECK_PATCH(address + 0xe, store_const_0x41, 227);
                    }
                }
            }
        }
        // HalpDispatchInterrupt isn't an export ... so ... the nearest export
        // at this time is KeReleaseQueuedSpinLock which is 0x1e4 bytes earlier.
        status = GetModuleExportAddress(halBase,
                                        //"HalpDispatchInterrupt",
                                        "KeReleaseQueuedSpinLock",
                                        &address);
        if (NT_SUCCESS(status))
        {
            if (osType == osW2k3)
            {
                if (mp)
                {
                    address += 0x1e4;
                }
                else
                {
                    address += 0x178;

                    address -= 1; // UP is identical to MP but one byte earlier.
                }

                // Add offset to first instruction to modify.
                address += 0xa6;

                CHECK_PATCH(address + 0x00, load_esi, 28);  // 0
                CHECK_PATCH(address + 0x06, store_eax, 28); // 6
                CHECK_PATCH(address + 0x1f, store_esi, 29); // 1f
                CHECK_PATCH(address + 0x25, load_ecx, 30);  // 25

                // HalpApcInterrupt isn't an export ... so ... same story but it's
                // another 1ac bytes along.
                //
                // We should subtract out the offset to the first instruction
                // modified, then add in the offset to the first instruction in
                // HalpApcInterrupt ... but it's the same offset at 0xa6 (for
                // W2k3).

                address += 0x1ac; // HalpApcInterrupt - HalpDispatchInterrupt
            }
            else
            {
                if (osType == osXP)
                {
                    if (mp)
                    {
                        address += 0x1e0;
                    }
                    else
                    {
                        address += 0x164;

                        address -= 1; // UP is identical to MP but one byte earlier.
                    }
                    address += 0x9f;  // offset to first instruction to change
                }
                else
                {
                    // W2k

                    if (mp)
                    {
                        address -= 0x504; // HalpDispatchInterrupt - KeReleaseQueuedSpinLock
                        address += 0x89;  // XP offset is 89 MP, 88 UP.

                        if (!IS_TPR_ACCESS(address, load_esi))
                        {
                            // In service pack 4 the difference between the
                            // above two routines has changed.  Let's try 0x4fc
                            // instead (ie 8 bytes closer than it was).

                            address += 0x504 - 0x4fc;
                        }
                    }
                    else
                    {
                        address -= 0x420; // HalpDispatchInterrupt - KeReleaseQueuedSpinLock
                        address += 0x88;
                    }
                }

                CHECK_PATCH(address + 0x00, load_esi, 228);
                CHECK_PATCH(address + 0x06, store_eax, 228);
                CHECK_PATCH(address + 0x1d, store_esi, 229);// <-- slight difference
                CHECK_PATCH(address + 0x23, load_ecx, 230);

                // HalpApcInterrupt

                if (osType == osXP)
                {
                    address += 0x1a4;     // HalpApcInterrupt - HalpDispatchInterrupt
                }
                else
                {
                    // W2k

                    address += 0x18c;     // HalpApcInterrupt - HalpDispatchInterrupt
                }
            }

            // HalpApcInterrupt is identical for mp/up, xp/w2k3 but at slightly
            // different offsets (adjusted for above).

            CHECK_PATCH(address + 0x00, load_ecx, 31);
            CHECK_PATCH(address + 0x07, store_eax, 32);
            CHECK_PATCH(address + 0x29, store_eax, 33);
            CHECK_PATCH(address + 0x2e, load_ecx, 34);
        }

        if ((osType == osXP)
         || (osType == osW2k))
        {
            //
            // XP and older.
            //
            // ExAcquireFastMutex and ExReleaseFastMutex are implemented
            // in the HAL and manipulate the TPR directly.  In W2k3 both are
            // implemented in the kernel and use KfRaiseIrql/KfLowerIrql which
            // are already patched.
            //

            status = GetModuleExportAddress(halBase,
                                            "ExAcquireFastMutex",
                                            &address);
            if (NT_SUCCESS(status))
            {
                CHECK_PATCH(address + 0, load_eax, 241);          // MP/UP same.
                CHECK_PATCH(address + 5, store_const_0x3d, 242);
            }

            status = GetModuleExportAddress(halBase,
                                            "ExReleaseFastMutex",
                                            &address);
            if (NT_SUCCESS(status))
            {
                tmpOffset = mp ? 1 : 0;

                CHECK_PATCH(address + tmpOffset + 0x1a, store_eax, 243);
                CHECK_PATCH(address + tmpOffset + 0x1f, load_ecx, 244);
            }

            // ExTryToAcquireFastMutex, similar story.

            status = GetModuleExportAddress(halBase,
                                            "ExTryToAcquireFastMutex",
                                            &address);
            if (NT_SUCCESS(status))
            {
                CHECK_PATCH(address + 0x05, load_eax, 245);
                CHECK_PATCH(address + 0x0a, store_const_0x3d, 246);

                tmpOffset = mp ? 1 : 0;

                CHECK_PATCH(address + tmpOffset + 0x2f, store_ecx, 247);
                CHECK_PATCH(address + tmpOffset + 0x35, load_eax, 248);
            }

            //
            // XP Uniprocessor, KeReleaseInStackQueuedSpinLock manipulates the
            // TPR directly.  In MP and W2k3, the in stack versions just set
            // the address of the lock and branch into KeReleaseQueuedSpinLock
            // (already patched).
            //

            if ((osType == osXP)
             && (mp == FALSE))
            {
                status = GetModuleExportAddress(halBase,
                                                "KeReleaseInStackQueuedSpinLock",
                                                &address);
                if (NT_SUCCESS(status))
                {
                    CHECK_PATCH(address + 0x0b, store_ecx, 249);
                    CHECK_PATCH(address + 0x11, load_eax, 250);
                }
            }
        }
    }

    newBase = MapApicPatchSites(&size, &remapAdjust);
    if (!newBase) {
        TraceError(("Unable to remap HAL for writing, skipping APIC performance patches.\n"));
        nr_apic_patches_applied = 0;
        return 0;
    }

    // Apply patches.
    oldIrql = QuiesceSystem();
    for (i = 0; i < nr_apic_patches_applied; i++)
    {
        PCHAR m;
        PCHAR t;
        LONG  n;
        ULONG branchOffset;

        m = apic_patches[i].Base;

        //
        // Each instruction will be overwritten with a CALL
        // instruction calling into a trampoline to bounce us into C
        // code to handle the TPR read or write.
        //
        // For "test" or "bt" instructions, the instruction and the following
        // conditional branch are nop'd out and we use an unconditional branch
        // instead of a call instruction below.
        //
        // Start by filling any left over bytes with no-ops.
        //

        t = m + remapAdjust;
        n = TprCode[apic_patches[i].Type].Len - 5;

        //
        // write the call (0xe8) or branch (0xe9) instruction.
        //

        if (apic_patches[i].Type < TPR_HANDLE_MAX)
        {
            *t++ = 0xe8;

            //
            // Compute the IP offset which is from the instruction following
            // the call instruction to the handler for this instruction type
            // and write out the completed instruction.
            //

            branchOffset = TprHandlerEntry[apic_patches[i].Type] - (PUCHAR)(m + 5);
        }
        else
        {
            //
            // It's a branch and the offset is just beyond the instructions
            // we're skipping.
            //

            *t++ = 0xe9;
            branchOffset = n;
        }
        *(PULONG)t = branchOffset;

        //
        // Fill any left over bytes with no-ops.
        //

        t += sizeof(ULONG);

        while (n--)
        {
            *t++ = 0x90;
        }
    }

    UnquiesceSystem(oldIrql);

    MmUnmapIoSpace(newBase, size);

    TraceInfo(("Applied %d APIC performance patches.\n", nr_apic_patches_applied));

#undef CHECK_PATCH

    return 1;
}

//
// _tpr_store
//
// Software handler for a store of NewPriority to the local APIC Task
// Priority Register (TPR).  Presumably reached from the patched TPR store
// sites via the trampolines -- confirm against the assembly stubs.
//
// NewPriority - raw value being written to the TPR.
//
// The value is written to this processor's page in _g_Apic.  Only when the
// priority is being lowered and IRR shows a pending interrupt whose
// priority class is above the new value is the store repeated through the
// kernel's own APIC mapping (0xfffe0080) so the hypervisor gets involved.
//
void
FASTCALL
_tpr_store(
    unsigned int    NewPriority
    )
{
    unsigned int    oldPriority;
    PAPIC_PAGE      apic;
    int             i;

    //
    // On entry interrupts are disabled.  Determine the virtual address of the
    // local apic (writable) page.
    //

    apic = _g_Apic[KeGetCurrentProcessorNumber()];

    if (!apic)
    {
        /* apic will be null if the processor supports alternative
           TPR access methods, or we're part way through a
           hibernation (probably on the way to resuming) and the
           apic pages aren't currently mapped.

           If the processor supports alternative TPR access methods,
           on Intel, the TPR is remapped by h/w and we don't need to
           do anything, on AMD we use the LOCK MOV CR0 instruction
           (Alt Mov CR8) to access CR8 directly.

           If no alternative access methods just use the kernel's known
           mapping address and let Xen sort it out. */

        if (_g_ApicAltMovCR8AMD)
        {
            // CR8 holds only the 4-bit priority class (TPR bits 7:4).
            mov_to_cr8((NewPriority >> 4) & 0xf);
        }
        else
        {
            *(unsigned int *)0xfffe0080 = NewPriority;
        }
        return;
    }

    oldPriority = apic->TPR;
    apic->TPR = NewPriority;

    if (NewPriority >= oldPriority)
    {
        //
        // Priority raised or unchanged, nothing can have become
        // deliverable.
        //

        return;
    }

    //
    // We are lowering priority, if there are any pending interrupts at a
    // priority higher than the new level, let the hypervisor inject the
    // highest now.
    //
    // I don't think we have the information here to correctly set
    // apic->TMR so, if there's an interrupt pending, we let the hypervisor
    // handle it rather than injecting it ourselves.
    //
    // Note: There is a small window here where apic->IRR could be updated
    // by the hypervisor and we could miss it, that's ok as long as we're
    // letting the HVM hardware pick it up when IF gets set on the way out.
    //
    // Get highest IRR.  No point examining indexes lower than the new
    // priority's index.
    //

    for (i = 7; i >= (int)(NewPriority >> 5); i--)
    {
        ULONG           irr;
        ULONG           bitIndex;
        unsigned int    pendingPriority;

        //
        // Read the dword exactly once.  The hypervisor may change IRR
        // underneath us, and _BitScanReverse leaves its index undefined
        // when handed a zero mask, so the value tested and the value
        // scanned must be the same.
        //

        irr = apic->IRR[i];
        if (irr == 0)
        {
            continue;
        }

        //
        // Find highest bit in this dword and turn it into a priority
        // level (vector with the low nibble masked off).
        //

        _BitScanReverse(&bitIndex, irr);
        pendingPriority = ((i << 5) + bitIndex) & 0xf0;

        if (pendingPriority > NewPriority)
        {
            //
            // Interrupt needs to be delivered, use the old TPR mechanism
            // to get the hypervisor into the picture.  The interrupt
            // won't actually be delivered until we set the interrupt
            // flag in the return path, we may want to optimize this to
            // avoid multiple Hv exits.
            //

            *(unsigned int *)0xfffe0080 = NewPriority;
        }

        //
        // This was the highest pending vector; lower dwords only hold
        // lower vectors, so there is nothing further to examine.
        //

        break;
    }
}

//
// _tpr_load
//
// Returns the current Task Priority Register value for the processor this
// code is running on.
//
// The processor number lookup and the dereference that follows must happen
// on the same processor.  This routine is always called with interrupts
// disabled, so no thread switch can slip in between the two and leave us
// reading the TPR of some other processor's apic.
//
unsigned int
FASTCALL
_tpr_load(void)
{
    PAPIC_PAGE      localApic = _g_Apic[KeGetCurrentProcessorNumber()];

    if (localApic)
    {
        // Normal case, read the value from this processor's page.
        return localApic->TPR;
    }

    //
    // No page recorded for this processor.
    //

    if (_g_ApicAltMovCR8AMD)
    {
        // CR8 holds the priority class; shift it back up to TPR bits 7:4.
        return mov_from_cr8() << 4;
    }

    // Fall back to the kernel's own mapping of the APIC TPR.
    return *(unsigned int *)0xfffe0080;
}

//
// XenSpinBeforeYield is in a global so we can tweak it from the debugger
// (XenStore might be more elegant) while attempting to determine the
// right number of iterations before yielding the vcpu.  A quick benchmark
// with two 4 vcpu VMs doing builds of the DDK sources simultaneously
// gave the following results.
//
// iterations         elapsed (hh:mm:ss)
// 128                00:08:09
// 256                00:07:05
// 512                00:07:32
// 1,000              00:07:09
// 2,048              00:06:33 <- best
// 4,096              00:06:48
// 8,192              00:07:00
// 16,384             00:07:20
// ... trend continues upward towards ~10 minutes ...
// 2,147,483,647      00:10:17
//
LONG XenSpinBeforeYield = 2048;

//
// Incremented by XenWaitForZeroBit each time it gives up spinning and
// yields the vcpu.
//
volatile LONG XenSpinYieldCount;

//
// XenWaitForZeroBit
//
// Spin until every bit selected by TestBit is clear in *Address.
//
// Address - location to poll.
// TestBit - MASK (not a bit number) that is ANDed with *Address.  A mask of
//           zero matches immediately.
//
// If the bits don't become zero in a reasonable number of iterations (the
// average spinlock, I'm told, is only held for about 400 clocks), call the
// scheduler and give up this vcpu's time slice.  This should help when
// this vcpu is spinning on a resource held by another vcpu which is
// waiting for a real cpu to run on.
//
void
FASTCALL
XenWaitForZeroBit(
    volatile LONG *     Address,
    LONG                TestBit
    )
{
    int                 spin;

    for (;;)
    {
        //
        // XenSpinBeforeYield can be changed from the debugger.  Make sure
        // the bit is tested at least once per pass; with a value of zero
        // the loop below would otherwise never look at *Address and we
        // would yield forever, and a negative value would count down
        // through signed overflow.
        //

        spin = XenSpinBeforeYield;
        if (spin < 1)
        {
            spin = 1;
        }

        for (; spin != 0; spin--)
        {
            if ((*Address & TestBit) == 0)
            {
                return;
            }
            _mm_pause();
        }

        // Spin budget exhausted: count the yield and give up the vcpu.
        InterlockedIncrement(&XenSpinYieldCount);
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
    }
}


//
// Pointer to the time stamp / lock word used by _XenFlushEntireTb.  Going
// by the name this is the kernel's KiFlushTbTimeStamp; it is not assigned
// in this part of the file.
//
static volatile LONG * _gp_KiFlushTbTimeStamp;

// NOTE(review): presumably flush statistics counters; not referenced in
// this part of the file -- confirm where (and whether) they are updated.
int XenFlushCount[6];

//
// Build-time choice of flush hypercall used by _XenFlushEntireTbRaw.  When
// FLUSH_USE_MMUEXT_OP is defined the flush is issued as an mmuext_op using
// the descriptor below, otherwise an hvm_op is used.
//
//#define FLUSH_USE_MMUEXT_OP
#if defined(FLUSH_USE_MMUEXT_OP)
static struct mmuext_op XenFlushMMUEXT_OP;
#endif

//
// _XenFlushEntireTbRaw
//
// Issue the hypercall that flushes the TLB and hand back its result.  No
// locking or IRQL handling is done here.  Which hypercall is used is
// selected at build time by FLUSH_USE_MMUEXT_OP.
//
int
FASTCALL
_XenFlushEntireTbRaw(void)
{
    int hypercallResult;

#if defined(FLUSH_USE_MMUEXT_OP)
    hypercallResult = HYPERVISOR_mmuext_op(&XenFlushMMUEXT_OP);
#else
#define HVMOP_flush_tlb_all     5
    hypercallResult = HYPERVISOR_hvm_op(HVMOP_flush_tlb_all, NULL);
#endif

    return hypercallResult;
}


//
// _XenFlushEntireTb
//
// Flush the entire TLB via the hypervisor while holding the lock bit in
// the word _gp_KiFlushTbTimeStamp points at.  Raises IRQL to SYNCH_LEVEL
// for the duration and restores the previous IRQL before returning.
//
void
FASTCALL
_XenFlushEntireTb(void)
{
    //
    // In Windows 2003 it appears that K*FlushEntireTb is the only place
    // where KiFlushTbTimeStamp is written, apparently that's not true in
    // Vista.
    //
    // Apparently in Vista, it is written elsewhere because of routines
    // optimized to function with a hypervisor.  If that's the case, we
    // should piggyback on it.
    //
    // I really wish we didn't have to raise IRQL or acquire the time
    // stamp lock but unless we implement it all in the hypervisor we
    // have to.  The raise irql is to avoid getting context switched (or
    // interrupted except by IPI) while we hold this lock.
    //
    // Note: IPI is why we can't just cli.
    //
    // Note: In Windows 2003, KiFlushTbTimeStamp is a counter whose
    // least significant bit is a lock bit and the upper 31 bits are
    // a free running count.  Effectively, if the low bit can be set
    // and was previously clear we take the lock.  Incrementing the
    // locked value releases the lock and increments the count.
    //
    // In Windows XP, the lock bit is the most significant bit so
    // that trick doesn't work.  Currently, don't do this patch for XP.
    //

    volatile LONG * timeStamp = _gp_KiFlushTbTimeStamp;

#if !defined(SYNCH_LEVEL)
#define SYNCH_LEVEL (PROFILE_LEVEL)
#endif

    KIRQL oldIrql = KfRaiseIrql(SYNCH_LEVEL);

    for (;;)
    {
        //
        // Wait (spinning, then yielding the vcpu) until the lock bit looks
        // clear before attempting the interlocked set.  XenWaitForZeroBit
        // takes a MASK, so bit 0 is passed as 1; passing 0 here would
        // match immediately and the wait would never happen.
        // _interlockedbittestandset, by contrast, takes a bit NUMBER.
        //

        XenWaitForZeroBit(timeStamp, 1);
        if (_interlockedbittestandset(timeStamp, 0) == 0)
        {
            // Old value 0, got the lock.
            break;
        }
        _mm_pause();
    }

    _XenFlushEntireTbRaw();

    //
    // Release lock and bump count.  Adding one to a value whose low bit
    // is set clears that bit and carries into the count above it.
    //

    InterlockedIncrement(timeStamp);
    KeLowerIrql(oldIrql);
}

//
// Prepare a range of code for patching.
//
// Creates a second, writable mapping of the physical memory behind Base
// (Length bytes), then calls QuiesceSystem and stores the KIRQL it returns
// in *pIrql.  The caller writes the patch through the returned pointer and
// then hands the pointer and the saved KIRQL to PostPatch.
//
// Returns NULL (without calling QuiesceSystem and without touching *pIrql)
// if the mapping cannot be created; Comment is only used to tag the
// resulting log line.
//
// Warning: the range is assumed to be physically contiguous.  That holds
// for the kernel and hal, but at some stage this ought to cope with
// discontiguous pages.
//
static PUCHAR
PrePatch(PVOID Base, int Length, PKIRQL pIrql, int Comment)
{
    PUCHAR writableAlias;

    writableAlias = MmMapIoSpace(MmGetPhysicalAddress(Base), Length, MmCached);

    if (writableAlias != NULL)
    {
        *pIrql = QuiesceSystem();
        return writableAlias;
    }

    TraceInfo(("XS MP TLB/Spin opt: failed to remap kernel (%d), skipping opt.\n", Comment));
    return NULL;
}

//
// Undo PrePatch once the patch bytes have been written.
//
// Irql   - value PrePatch stored through its pIrql argument.
// Base   - the pointer PrePatch returned (the temporary mapping, not the
//          original code address).
// Length - the same length that was passed to PrePatch.
//
static void
PostPatch(KIRQL Irql, PVOID Base, int Length)
{
    // Let the rest of the system run again before tearing down the mapping.
    UnquiesceSystem(Irql);
    MmUnmapIoSpace(Base, Length);
}
//
// TRUE if the bytes at x equal the byte sequence y.  y must be a char/UCHAR
// array initialised from a string literal (not a pointer): the length
// compared is sizeof(y)-1, i.e. the array without its terminating NUL.
//
#define CODE_MATCH(x,y) \
        (RtlCompareMemory((x), (y), sizeof(y)-1) == (sizeof(y)-1))

//
// Redirect the kernel's cross-processor TLB flush paths (KxFlushEntireTb,
// KeFlushSingleTb, KeFlushMultipleTb, KeFlushProcessTb) to the tpr_*
// patch targets so that a hypercall is used instead of IPIs.
//
// Every patch site is verified against an expected byte sequence before
// it is touched; the sites after the first are located by fixed offsets
// from the previous site.
//
// Returns 0 if nothing was patched (uniprocessor VM, export not found,
// KxFlushEntireTb not recognised, hypercall probe failed, or the first
// remap failed).  Returns 1 once the KxFlushEntireTb patch has been
// written, even if one of the later patches was skipped.
//
static int
_PatchTlbFlush()
{
    UNICODE_STRING  functionName;
    PUCHAR          keFlushEntireTb;
    PUCHAR          patchTarget;
    PUCHAR          newBase;
    KIRQL           oldIrql;
    PUCHAR          branchTarget;
    int             hvRc;
    char            pae = FALSE;

    // Patch targets; only their addresses are used here.
    extern UCHAR tpr_kxFlushEntireTbPatch;
    extern UCHAR tpr_keFlushSingleTbPatch;
    extern UCHAR tpr_keFlushMultipleTbPatch;

    //
    // Only applicable if this VM has more than one CPU.  Note, this
    // is different from the APIC code which is applicable if an MP
    // kernel/hal are in use, not the number of CPUs.
    //

    if (KeNumberProcessors == 1)
    {
        TraceInfo(("XS MP TLB flush optimization skipped, current VM is uniproc.\n"));
        return 0;
    }

#if defined(FLUSH_USE_MMUEXT_OP)
    XenFlushMMUEXT_OP.cmd = MMUEXT_TLB_FLUSH_ALL;
#endif

    //
    // In Windows 2003 (at least as far back as sp1) KeFlushEntireTb
    // just calls KxFlushEntireTb the difference being the former 
    // takes an argument specifying whether or not to broadcast
    // the flush to other cpus, which is ignored, and KxFlushEntireTb
    // doesn't have any arguments.
    //
    // KeFlushCurrentTb is used to flush the current processor's TLB
    // only but that is not exported (obviously a driver is not clever
    // enough to make such a distinction).
    //
    // Locate KeFlushEntireTb, because it IS a kernel export.
    //

    RtlInitUnicodeString(&functionName, L"KeFlushEntireTb");
    keFlushEntireTb = (PUCHAR)(ULONG_PTR)
                      MmGetSystemRoutineAddress(&functionName);

    if (keFlushEntireTb == NULL)
    {
        TraceWarning(("XS MP TLB opt: Couldn't locate exported kernel function, skipping opt.\n"));
        return 0;
    }

    //
    // Is the first instruction a call instruction?  If so, it's a
    // call to KxFlushEntireTb.  (The next instruction is a ret 8
    // which we'll check for sanity's sake).
    //

    if ((keFlushEntireTb[0] == 0xe8)    // call instruction
     && (keFlushEntireTb[5] == 0xc2)    // ret nnnn
     && (keFlushEntireTb[6] == 0x08)    //     nnnn == 8
     && (keFlushEntireTb[7] == 0x00))
    {
        //
        // The entry to KxFlushEntireTb looks like
        //

        UCHAR leadin[] = "\x8b\xff"     //  mov     edi,edi
                         "\x55"         //  push    ebp
                         "\x8b\xec"     //  mov     ebp,esp
                         "\x51"         //  push    ecx
                         "\x53"         //  push    ebx
                         "\x56"         //  push    esi
                         "\x57"         //  push    edi
                         "\xff\x15";    //  call    dword ptr [xxxxxxxx]

        PUCHAR p;

        //
        // If it doesn't, bail.  We need the big fat call instruction to
        // insert a branch into our alternative code and we want to return
        // directly to the caller which means we'll need to restore the non
        // volatiles and unwind the stack correctly.
        //

        // Follow the call: target = address after the 5 byte call + rel32.
        // From here on keFlushEntireTb actually points at KxFlushEntireTb.
        LONG offset = *(PLONG)(keFlushEntireTb + 1);
        keFlushEntireTb += 1 + 4 + offset;

        if (!CODE_MATCH(keFlushEntireTb, leadin))
        {
            TraceInfo(("XS MP TLB opt: Code mismatch (%d), skipping opt.\n", 1));
            return 0;
        }

        //
        // Following the call instruction we should have
        //
        // 8845ff          mov     byte ptr [ebp-1],al
        // bbxxxxxxxx      mov     ebx,offset nt!KiTbFlushTimeStamp
        //
        // and we need the address of KiTbFlushTimeStamp as we'll have
        // to manipulate the timestamp in the patch.
        //

        // leadin is 11 bytes (sizeof - 1); + 4 skips the call's operand.
        p = keFlushEntireTb + sizeof(leadin) - 1 + 4;
        if (RtlCompareMemory(p, "\x88\x45\xff\xbb", 4) != 4)
        {
            TraceInfo(("XS MP TLB opt: Code mismatch (%d), skipping opt.\n", 2));
            return 0;
        }
        p += 4;
        _gp_KiFlushTbTimeStamp = (volatile LONG *)(ULONG_PTR)*(PULONG)p;

        // sizeof(leadin) - 3 == offset 9: the ff 15 opcode of the 6 byte
        // indirect call, which is what gets overwritten below.
        patchTarget = keFlushEntireTb + sizeof(leadin) - 3;
        branchTarget = &tpr_kxFlushEntireTbPatch;
    }
    else
    {
#if 0
        //
        // The entry to KeFlushEntireTb in the case of Windows XP SP2 is
        //

        UCHAR leadin[] = "\x8b\xff"     //  mov     edi,edi
                         "\x55"         //  push    ebp
                         "\x8b\xec"     //  mov     ebp,esp
                         "\x80\x7d\x0c\x00"//cmp    byte ptr [ebp+0Ch],0
                         "\x53"         //  push    ebx
                         "\x56"         //  push    esi
                         "\x74\x16"     //  je      nt!KeFlushEntireTb+0x23 (804fb2b7)
                         "\xff\x15"     //  call    dword ptr [xxxxxxxx]

        if (RtlCompareMemory(leadin, keFlushEntireTb, sizeof(leadin)-1) !=
            (sizeof(leadin)-1))
        {
            TraceInfo(("XS MP TLB opt: Code mismatch (%d), skipping opt.\n", 3));
            return 0;
        }

        patchTarget = KeFlushEntireTb + sizeof(leadin) - 3;
        branchTarget = keFlushEntireTbPatch;
#else
        //
        // Don't try to support anything other than Win2k3 for the moment.
        // The timestamp in XP is different, the lock bit is the MSB instead
        // of the LSB and the counter in the low 31 bits (unlike 2k3 where 
        // the counter is in the upper 31 bits and the lock in the lower 
        // which means you can unlock and bump by (atomically) adding one
        // to the locked value.
        //
        // Until we figure out what to do with all that, ignore other
        // systems, the important ones (from a performance POV are running
        // 2k3 anyway).  peterj 8-21-07.
        //
        TraceInfo(("XS MP TLB opt: Doesn't look like 2k3, skipping opt.\n"));
        return 0;
#endif
    }

    //
    // Convert branchTarget from absolute to relative for the jmp instruction.
    // (The displacement is relative to the end of the 5 byte jmp.  From here
    // on branchTarget holds a displacement, not a usable pointer.)
    //

    branchTarget -= ((ULONG_PTR)patchTarget + 5);


    //
    // Does the hypervisor support this hypercall? 
    //

    if ((hvRc = _XenFlushEntireTbRaw()) != 0)
    {
        TraceInfo(("XS TlbFlush hypercall error %d, ignoring, not optimizing.\n",
                   -hvRc));
        return 0;
    }

    if ((newBase = PrePatch(patchTarget, 6, &oldIrql, 1)) == NULL)
    {
        return 0;
    }

    // Replace the 6 byte indirect call with a 5 byte jmp plus an int 3.
    *newBase = 0xe9;                        // jmp
    *(PULONG)(newBase+1) = (ULONG)(ULONG_PTR)branchTarget;
    *(newBase+5) = 0xcc;                    // int 3 Shouldn't get here.

    PostPatch(oldIrql, newBase, 6);

    //
    // KeFlushSingleTb
    //
    // Really need to develop a sequence matcher.
    //
    // Hack out the call to KiIpiSendPacket in KeFlushSingleTb
    // and replace it with a hypercall to flush all TBs, and then
    // skip the local flush and wait for IPI.
    //
    // There is a test to see if there are other processors to
    // send to and if so, the IpiSendPacket stuff is out of line
    // and looks like-
    //
    // nt!KeFlushSingleTb+0x3b:
    //   6a00            push    0
    //   ff75f8          push    dword ptr [ebp-8]
    //   6a00            push    0
    //   68xxxxxxxx      push    offset nt!KiFlushTargetSingleTb
    //   56              push    esi
    //   e8141dffff      call    nt!KiIpiSendPacket
    //   ebb9            jmp     nt!KeFlushSingleTb+0x4d
    //
    //
    // The code it branches back to looks like-
    //
    // nt!KeFlushSingleTb+0x4d:
    //   8b45f8          mov     eax,dword ptr [ebp-8]
    //   0f0138          invlpg  [eax]
    //   85f6            test    esi,esi
    //   750d            jne     nt!KeFlushSingleTb+0x57
    //
    // where that last branch is to the wait for IPI if there were 
    // other processors to wait for.   We'll just skip that whole
    // sequence.
    //
    // With no image search tools and no export for this routine,
    // we're going to rely on its offset from the previous patch
    // which was correct for W2k3sp2 as of August 07.
    //
    // This code differs between PAE and non-PAE systems, either because of 
    // BBT or compiler whimsicalness.  Need to handle both.
    //
    // Note: The sequence splits in two because of the absolute 
    // address of KiFlushTargetSingleTb which is pushed.  This
    // will vary depending on kernel load address which can vary.
    //


    {
        UCHAR seq1[] = "\x6a\x00"       // push    0
                       "\xff\x75\xf8"   // push    dword ptr [ebp-8]
                       "\x6a\x00"       // push    0
                       "\x68";          // push    offset nt!KiFlushTargetSingleTb

        UCHAR seq2[] = "\x56"           // push    esi
                       "\xe8\x14\x1d\xff\xff"// call    nt!KiIpiSendPacket
                       "\xeb\xb9";      //jmp     nt!KeFlushSingleTb+0x4d

        UCHAR pae1[] = "\x56"           // push    esi
                       "\xe8";          // call    nt!KiIpiSendPacket

        UCHAR pae2[] = "\x8b\x45\xf8"   // mov     eax,dword ptr [ebp-8]
                       "\x0f\x01\x38"   // invlpg  [eax]
                       "\x85\xf6"       // test    esi,esi
                       "\x74\x23";      // je      nt!KeFlushSingleTb+0x7a

        if (CODE_MATCH(patchTarget + 0x568d, seq1) &&
            CODE_MATCH(patchTarget + 0x568d + sizeof(seq1)-1 + 4 , seq2))
        {
            patchTarget += 0x568d;

            //
            // It matches, life is good.  We're going to patch the branch target 
            // of the call instruction to our own routine (which will take care 
            // to undo the pushes) and we're also going to adjust the jump back
            // instruction to skip the invlpg and the test for multiple CPUs.
            //

            // Skip seq1 (8 bytes), the 4 byte pushed address, push esi and
            // the e8 opcode: patchTarget now addresses the call's rel32.
            patchTarget += 2 + sizeof(seq1)-1 + 4;
            branchTarget = &tpr_keFlushSingleTbPatch;
            branchTarget -= ((ULONG_PTR)patchTarget + 4);

            // 6 bytes: the call's rel32 followed by the 2 byte short jmp.
            if ((newBase = PrePatch(patchTarget, 6, &oldIrql, 2)) == NULL)
            {
                return 1;
            }

            // Adjust call instruction to call keFlushSingleTbPatch.

            *(PULONG)newBase = (ULONG)(ULONG_PTR)branchTarget;

            // Adjust branch back to skip invlpg and test for MP.
            // (Sizes of the mov, invlpg, test and jne being skipped.)

            *(newBase+5) += (3 + 3 + 2 + 2);

            PostPatch(oldIrql, newBase, 6);
        }
        else if (CODE_MATCH(patchTarget + 0x254, pae1) && 
                 CODE_MATCH(patchTarget + 0x254 + 6, pae2) &&
                 (*(patchTarget + 0x24f) == 0x68))
        {
            pae = TRUE;
            // +0x250 is the 4 byte operand of the push (opcode 68 at +0x24f).
            patchTarget += 0x250;

            //
            // In this case the code is all inline (same as KeFlushMultipleTb
            // below) we patch the call instruction to use our tame version 
            // which will adjust its return address to skip the invlpg and 
            // test instructions and will ensure the z flag is set (for je)
            // to skip the wait for ipi completion stuff.
            //
            // The amount the return adjustment is given by replacing the
            // pushed offset of KiFlushTargetSingleTb (the IPI target routine).
            //

            // Displacement is relative to the end of the call instruction:
            // push operand (4) + push esi, e8 (2) + rel32 (4).
            branchTarget = &tpr_keFlushMultipleTbPatch;
            branchTarget -= ((ULONG_PTR)patchTarget + 4 + 2 + 4);

            if ((newBase = PrePatch(patchTarget, 10, &oldIrql, 5)) == NULL)
            {
                return 1;
            }

            // Adjust the pushed value so the callee can adjust the return
            // address to skip the unneeded invltb and test for MP.

            *(PULONG)(newBase) = 8;

            // Adjust call instruction to call keFlushMultipleTbPatch.

            *(PULONG)(newBase + 4 + 2) = (ULONG)(ULONG_PTR)branchTarget;

            PostPatch(oldIrql, newBase, 10);
        }
        else
        {
            // Didn't match, bail.
            TraceInfo(("XS MP TLB opt: Code mismatch (%d), skipping opt.\n", 4));
            return 1;
        }
    }

    //
    // KeFlushMultipleTb is fairly similar to KeFlushSingleTb except
    // the IPI stuff is inline instead of out of line.
    //
    // We'll patch the call to KiIpiSendPacket as above but we can't
    // adjust the instruction after that to do the skip for us, so
    // instead we have the patch target adjust its return address by
    // a fixed constant (hopefully we don't end up with too many versions
    // of this).  Kind of gross.
    //

    // Offsets are relative to wherever the KeFlushSingleTb step left
    // patchTarget, which differs between the two variants.
    if (pae == FALSE)
    {
        patchTarget += 0x1248;
    }
    else
    {
        patchTarget -= 0xa1;
    }

    {
        // nt!KeFlushMultipleTb+0x40:
        UCHAR seq1[] = "\x57"           // push    edi
                       "\x53"           // push    ebx
                       "\x6a\x00"       // push    0
                       "\x68";          // push offset nt!KiFlushTargetMultipleTb

        UCHAR seq2[] = "\x56"           // push    esi
                       "\xe8";          // call    nt!KiIpiSendPacket

        UCHAR seq3[] = "\x8b\x07"       // mov     eax,dword ptr [edi]
                       "\x89\x45\x10"   // mov     dword ptr [ebp+10h],eax
                       "\x8b\x45\x10"   // mov     eax,dword ptr [ebp+10h]
                       "\x0f\x01\x38"   // invlpg  [eax]
                       "\x83\xc7\x04"   // add     edi,4
                       "\x3b\xfb"       // cmp     edi,ebx
                       "\x72\xee"       // jb      nt!KeFlushMultipleTb+0x4f
                       "\x85\xf6";      // test    esi,esi

        // 0x09 = seq1 (5) + pushed address (4); 0x0f = 0x09 + push esi,
        // e8 (2) + the call's rel32 (4).
        if (CODE_MATCH(patchTarget       , seq1) &&
            CODE_MATCH(patchTarget + 0x09, seq2) &&
            CODE_MATCH(patchTarget + 0x0f, seq3))
        {
            //
            // We're going to replace the pushed offset with the number of
            // bytes we want to skip on return (callee adjusts return address),
            // then we'll patch the call instruction to our own.
            //

            patchTarget += sizeof(seq1)-1;

            if ((newBase = PrePatch(patchTarget, 10, &oldIrql, 3)) == NULL)
            {
                return 1;
            }

            // Skip all of seq3 on return.
            *(PULONG)newBase = (ULONG)sizeof(seq3)-1;

            branchTarget = &tpr_keFlushMultipleTbPatch;
            branchTarget -= ((ULONG_PTR)patchTarget + 4 + 2 + 4);
            *(PULONG)(newBase + 4 + 2) = (ULONG)(ULONG_PTR)branchTarget;

            PostPatch(oldIrql, newBase, 10);
        }
        else
        {
            // Didn't match, bail.
            TraceInfo(("XS MP TLB opt: Code mismatch (%d), skipping opt.\n", 6));
            return 1;
        }
    }

    //
    // KeFlushProcessTb, same story as KeFlushMultipleTb, although it IS
    // out of line in the non-PAE case, in-line PAE.
    //
    // These offsets are getting a bit large, it might be better to find
    // a closer export.

    if (pae == FALSE)
    {
        patchTarget -= 0x23620;
    }
    else
    {
        patchTarget -= 0x8f;
    }

    {
        // nt!KeFlushProcessTb+0x28
        UCHAR seq1[] = "\x57"           // push    edi
                       "\x57"           // push    edi
                       "\x57"           // push    edi
                       "\x68";          // push offset nt!KiFlushTargetMultipleTb

        UCHAR seq2[] = "\x56"           // push    esi
                       "\xe8";          // call    nt!KiIpiSendPacket

        if (CODE_MATCH(patchTarget, seq1) &&
            CODE_MATCH(patchTarget + sizeof(seq1)-1 + 4, seq2))
        {
            patchTarget += sizeof(seq1)-1;

            if ((newBase = PrePatch(patchTarget, 10, &oldIrql, 4)) == NULL)
            {
                return 1;
            }

            //
            // Set the return address adjustment, in the out of line non-pae case
            // there's a bunch of backwards branching going on and we really just
            // want to skip it all, fortunately there's a copy of the return 
            // sequence there too so we'll just head for that.  In the pae case
            // it's the normal sort of stuff so we just skip down to the branch
            // around the ipi completion stuff having first set eflags to ensure
            // the branch is taken.
            //

            if (pae == FALSE)
            {
                *(PULONG)newBase = (ULONG)-0x78;
            }
            else
            {
                *(PULONG)newBase = 8;
            }

            branchTarget = &tpr_keFlushMultipleTbPatch;
            branchTarget -= ((ULONG_PTR)patchTarget + 4 + 2 + 4);

            // Adjust call instruction to call keFlushMultipleTbPatch.

            *(PULONG)(newBase + 4 + 2) = (ULONG)(ULONG_PTR)branchTarget;

            PostPatch(oldIrql, newBase, 10);
        }
        else
        {
            // Didn't match, bail.
            TraceInfo(("XS MP TLB opt: Code mismatch (%d), skipping opt.\n", 8));
            return 1;
        }
    }

    TraceInfo(("XS MP TLB hypercall optimization applied.\n"));

    return 1;
}


//
// Redirect the contended-lock test in three spin lock acquire routines to
// the tpr_* patch targets:
//
//   1. hal!KeAcquireSpinLockRaiseToSynch        (6 byte test -> call + nop)
//   2. hal!KeAcquireQueuedSpinLockRaiseToSynch  (7 byte test -> jmp + int 3s)
//   3. nt!KeAcquireQueuedSpinLockAtDpcLevel     (7 byte test -> jmp + int 3s)
//
// Each site is at a fixed offset (Win2k3 sp2) from an exported routine and
// is only patched if the expected test instruction is found there.  The
// three patches are independent: a failure of one is logged and the others
// are still attempted.
//
// Returns 0 if the VM is uniprocessor or the hal base cannot be found,
// otherwise 1.
//
// NOTE(review): 1 is returned even if none of the three sites matched;
// confirm that is what the caller expects before relying on the return
// value to mean "something was patched".
//
static int
_PatchSpinLocks()
{
    NTSTATUS        status;
    ULONG_PTR       halBase;
    ULONG           halSize;
    UNICODE_STRING  functionName;
    PUCHAR          patchTarget;
    PUCHAR          address;
    PUCHAR          newBase;
    KIRQL           oldIrql;
    PUCHAR          branchTarget;

    // Patch targets; only their addresses are used here.
    extern UCHAR tpr_keAcquireQueuedSpinLockAtDpcLevel;
    extern UCHAR tpr_keAcquireQueuedSpinLock;
    extern UCHAR tpr_keAcquireSpinLockRaiseToSynch;

    // Instructions expected at the patch sites.
    UCHAR test_ecx_1[]   = "\xf7\x01\x01\x00\x00\x00";      // test    dword ptr [ecx],1
    UCHAR test_ecxp4_1[] = "\xf7\x41\x04\x01\x00\x00\x00";  // test    dword ptr [ecx+4],1 
    UCHAR test_eaxp4_1[] = "\xf7\x40\x04\x01\x00\x00\x00";  // test    dword ptr [eax+4],1 

    //
    // Only applicable if this VM has more than one CPU.  Note, this
    // is different from the APIC code which is applicable if an MP
    // kernel/hal are in use, not the number of CPUs.
    //

    if (KeNumberProcessors == 1)
    {
        TraceInfo(("XS MP spinlock optimization skipped, current VM is uniproc.\n"));
        return 0;
    }

    status = GetSystemBaseAddress(NULL, NULL, &halBase, &halSize);

    if (!NT_SUCCESS(status))
    {
        TraceError(("XS MP spinlock: Could not find base of hal.dll\n"));
        return 0;
    }

    //
    // Patch 1: KeAcquireSpinLockRaiseToSynch (hal export).
    //

    status = GetModuleExportAddress(halBase,
                                    "KeAcquireSpinLockRaiseToSynch",
                                    &address);

    if (NT_SUCCESS(status))
    {
        // Win2k3 sp2
        patchTarget = address + 0x16;
        if (CODE_MATCH(patchTarget, test_ecx_1))
        {
            // rel32 is relative to the end of the 5 byte call.
            branchTarget = &tpr_keAcquireSpinLockRaiseToSynch;
            branchTarget -= ((ULONG_PTR)patchTarget + 5);

            if ((newBase = PrePatch(patchTarget, 6, &oldIrql, 20)) != NULL)
            {
                //
                // Apply patch.
                //

                *newBase = 0xe8;                        // call
                *(PULONG)(newBase+1) = (ULONG)(ULONG_PTR)branchTarget;
                *(newBase+5) = 0x90;                    // nop remaining byte

                PostPatch(oldIrql, newBase, 6);
                TraceNotice(("XS MP spin opt: patch 1 applied.\n"));
            }
            else
            {
                TraceWarning(("XS MP spin opt: failed to remap hal (%d), skipping opt.\n", 1));
            }
        }
        else
        {
            TraceInfo(("patch mismatch %s\n", "KeAcquireSpinLockRaiseToSynch"));
        }
    }
    else
    {
        TraceInfo(("Could not find address of %s\n", "KeAcquireSpinLockRaiseToSynch"));
    }

    //
    // Patch 2: KeAcquireQueuedSpinLockRaiseToSynch (hal export).
    //

    status = GetModuleExportAddress(halBase,
                                    "KeAcquireQueuedSpinLockRaiseToSynch",
                                    &address);

    if (NT_SUCCESS(status))
    {
        // Win2k3 sp2
        patchTarget = address + 0x35;
        if (CODE_MATCH(patchTarget, test_eaxp4_1))
        {
            // rel32 is relative to the end of the 5 byte jmp.
            branchTarget = &tpr_keAcquireQueuedSpinLock;
            branchTarget -= ((ULONG_PTR)patchTarget + 5);

            if ((newBase = PrePatch(patchTarget, 7, &oldIrql, 21)) != NULL)
            {
                //
                // Apply patch.
                //

                *newBase = 0xe9;                        // jmp
                *(PULONG)(newBase+1) = (ULONG)(ULONG_PTR)branchTarget;
                *(newBase+5) = 0xcc;                    // shouldn't get here
                *(newBase+6) = 0xcc;                    // shouldn't get here

                PostPatch(oldIrql, newBase, 7);
                TraceNotice(("XS MP spin opt: patch 2 applied.\n"));
            }
            else
            {
                TraceWarning(("XS MP spin opt: failed to remap hal (%d), skipping opt.\n", 2));
            }
        }
        else
        {
            TraceInfo(("patch mismatch %s\n", "KeAcquireQueuedSpinLockRaiseToSynch"));
        }
    }
    else
    {
        TraceInfo(("Could not find address of %s\n", "KeAcquireQueuedSpinLockRaiseToSynch"));
    }

    //
    // Patch 3: KeAcquireQueuedSpinLockAtDpcLevel.
    //
    // KeAcquireQueuedSpinLockAtDpcLevel is not exported by the kernel but
    // KeAcquireInStackQueuedSpinLockAtDpcLevel is and after initializing 
    // the private lock structure it branches into the other. 
    //
    // Initially coding for w2k3, we'll just offset.  We could follow the
    // logic and the branch if need be.  Reexamine when modifying this for
    // other guests.
    //

    RtlInitUnicodeString(&functionName, L"KeAcquireInStackQueuedSpinLockAtDpcLevel");
    patchTarget = (PUCHAR)(ULONG_PTR)
                      MmGetSystemRoutineAddress(&functionName);

    if (patchTarget != NULL)
    {
        // Win2k3 sp2
        patchTarget += 0x2b;
        if (CODE_MATCH(patchTarget, test_ecxp4_1))
        {
            // rel32 is relative to the end of the 5 byte jmp.
            branchTarget = &tpr_keAcquireQueuedSpinLockAtDpcLevel;
            branchTarget -= ((ULONG_PTR)patchTarget + 5);

            if ((newBase = PrePatch(patchTarget, 7, &oldIrql, 22)) != NULL)
            {
                //
                // Apply patch.
                //

                *newBase = 0xe9;                        // jmp
                *(PULONG)(newBase+1) = (ULONG)(ULONG_PTR)branchTarget;
                *(newBase+5) = 0xcc;                    // shouldn't get here
                *(newBase+6) = 0xcc;                    // shouldn't get here

                PostPatch(oldIrql, newBase, 7);
                TraceNotice(("XS MP spin opt: patch 3 applied.\n"));
            }
            else
            {
                TraceWarning(("XS MP spin opt: failed to remap kernel (%d), skipping opt.\n", 3));
            }
        }
        else
        {
            TraceInfo(("patch mismatch %s\n", "KeAcquireQueuedSpinLockAtDpcLevel"));
        }
    }
    else
    {
        // Name the export that was actually looked up.
        TraceInfo(("Could not find address of %s\n", "KeAcquireInStackQueuedSpinLockAtDpcLevel"));
    }

    return 1;
}

// Address of nt!KdDebuggerEnabled held as a 32 bit integer; captured by
// _PatchPollDebugger from the operand of the cmp instruction it replaces.
// NOTE(review): non-static, presumably so the tpr_kdCheckDebuggerEnabled
// assembly helper can reference it -- confirm.
ULONG  _gp_kdDebuggerEnabled;
// NOTE(review): not referenced in this part of the file; presumably state
// kept by the tpr_kdCheckDebuggerEnabled helper -- confirm.
ULONG  _g_kdCheckDebuggerTick;

static int
_PatchPollDebugger()
{
    //
    // QEMU has code to detect when the guest is beating on the debugger
    // port which happens when the guest is in the debugger waiting for
    // instructions from the other end.  It can also appear to happen when
    // the clock interrupt is set to occur at a high rate such as during
    // multimedia playback, or when a flash animation is playing.  The
    // debugger is polled on every clock interrupt and because this looks
    // the same as the system being IN the debugger, QEMU delays the response
    // (so that a guest in the debugger is not hogging the whole host),
    // the delay is long enough that another clock interrupt will fire and
    // the guest is effectively hung unable to do anything but process
    // clock interrupts.
    //
    // This patch is on by default but it is only applied if the kernel
    // debugger is attached to the guest.
    //
    // Fortunately, KdPollDebugger is a kernel export in Windows 2000, XP
    // and 2003.   So if KdDebuggerEnabled (boolean) but as the method for
    // accessing exported data changes between 2K and later systems we'll
    // ignore that.
    //
    // We replace the cmp instruction with a call into our own code which
    // will return the appropriate flags .. which means we need to call 
    // into assembly code.
    //

    UNICODE_STRING  functionName;
    PUCHAR          kdPollBreakIn;
    PUCHAR          target = NULL;
    int             patchLength = 7;
    PUCHAR          branchTarget;
    PUCHAR          newBase;
    KIRQL           oldIrql;

    extern UCHAR tpr_kdCheckDebuggerEnabled;

    RtlInitUnicodeString(&functionName, L"KdPollBreakIn");
    kdPollBreakIn = (PUCHAR)(ULONG_PTR)
                    MmGetSystemRoutineAddress(&functionName);
    if (kdPollBreakIn != NULL)
    {
        target = kdPollBreakIn + 9;
        if (*kdPollBreakIn == 0x55)
        {
            //
            // push ebp == old style profile, possibly Windows 2000 profile
            //
            // 55              push    ebp
            // 8bec            mov     ebp,esp
            // 51              push    ecx
            // 51              push    ecx
            // 8065ff00        and     byte ptr [ebp-1],0
            // 803dxxxxxxxx00  cmp     byte ptr [nt!KdDebuggerEnabled],0
            // 745f            je      nt!KdPollBreakIn+0x71
            // e805820000      call    nt!KiDisableInterrupts
            //

            if (*(PUSHORT)target != 0x3d80)
            {
                //
                // No match, no patch.
                //

                return 0;
            }
        }
        else if (*(PUSHORT)kdPollBreakIn == 0xff8b)
        {
            //
            // Possibly XP or 2K3 which differ only in an extra ecx push.
            //
            // 8bff            mov     edi,edi
            // 55              push    ebp
            // 8bec            mov     ebp,esp
            // 51              push    ecx
            // 51              push    ecx <-- XP has extra ecx push
            // 53              push    ebx
            // 33db            xor     ebx,ebx
            // 381dxxxxxxxx    cmp     byte ptr [nt!KdDebuggerEnabled],bl
            // 

            if (*(PUSHORT)target != 0x1d38)
            {
                target++;
                if (*(PUSHORT)target != 0x1d38)
                {
                    //
                    // No match, no patch.
                    //

                    return 0;
                }
            }
            patchLength = 6;

        }

        //
        // Capture address of KdDebuggerEnabled. (NB 32 bit capture)
        //

        _gp_kdDebuggerEnabled = *(PULONG)(target + 2);

        //
        // If the debugger is not enabled don't bother with this patch.
        //

        if ((*(PUCHAR)_gp_kdDebuggerEnabled) == 0)
        {
            return 0;
        }

        branchTarget = &tpr_kdCheckDebuggerEnabled;
        branchTarget -= ((ULONG_PTR)target + 5);

        //
        // Apply patch.
        //

        if ((newBase = PrePatch(target, patchLength, &oldIrql, 23)) == NULL)
        {
            TraceWarning(("XS KDPOLL: failed to remap kernel, skipping.\n"));
            return 0;
        }
        
        //
        // Replace existing cmp instruction (either 6 or 7 bytes) with
        // a call to our helper and fill with no-ops.  (Always put 2 no-ops
        // in the last 2 bytes of the patch area, if only 6 bytes, the first
        // will be overwritten by the branch offset when it is written).
        //

        *newBase = 0xe8;
        *(newBase + patchLength - 2) = 0x90;
        *(newBase + patchLength - 1) = 0x90;
        *(PULONG)(newBase + 1) = (ULONG_PTR)branchTarget;

        PostPatch(oldIrql, newBase, patchLength);

        TraceNotice(("XS KDPOLL optimization applied.\n"));

        return 1;
    }
    return 0;
}


static int
_Patch2KIdleDelay()
{
    ULONG_PTR   kernelBase;
    ULONG       kernelSize;
    NTSTATUS    status;
    PCHAR       PoSetPowerState;
    PULONG      PopIdle0PromoteTicks;

    //
    // If not Windows 2000 or if patch already applied, get out.
    //

    if ((osType != osW2k)
     || (_g_BinPatchFeatureInstalled & DEBUG_PATCH_2K_IDLE_DELAY))
    {
        return 0;
    }

    status = GetSystemBaseAddress(&kernelBase,
                                  &kernelSize,
                                  NULL,
                                  NULL);
    if (!NT_SUCCESS(status))
    {
        TraceError(("Cound not find base of ntoskrnl for 2K idle delay patch\n"));
        return 0;
    }

    status = GetModuleExportAddress(kernelBase,
                                    "PoSetPowerState",
                                    &PoSetPowerState);
    if (!NT_SUCCESS(status))
    {
        TraceError(("Cound not find address of nt!PoSetPowerState for 2k idle delay patch\n"));
        return 0;
    }

    //
    // When a guest processor is idle we want it to execute a HLT instruction
    // as soon as possible to the hypervisor isn't dedicating cycles to a VM
    // that doesn't want them.  In Windows 2000, the system doesn't halt 
    // immediately when idle on the grounds that when it first goes idle it
    // is quite likely to have something to do almost immediately.  The logic
    // here was to avoid having another processor have to send an IPI to wake
    // the first guy up.  There are even benchmarks supporting this .. but in
    // a virtual world it's a really bad idea (also in a hyperthreaded world).
    //
    // As the system boots it calculates the number of ticks a processor should
    // idle for before it halts and sticks this in a global variable.  We need
    // to find the address of that global, PopIdle0PromoteTicks and zap it.
    // Unfortunately it's not exported, nor is the routine that sets it nor
    // the one that uses it.  Fortunately PoSetPowerState is and that's not
    // far away.  In particular, we're looking for this code-
    //
    // nt!PopDemoteIdleness+0x44:

#define PopPromoteIdlenessPlus44 \
       "\x8b\x55\xf8"             /* mov     edx,dword ptr [ebp-8]     */ \
       "\xf0\x0f\xb1\x11"         /* lock cmpxchg dword ptr [ecx],edx  */ \
       "\x39\x45\xfc"             /* cmp     dword ptr [ebp-4],eax     */ \
       "\x75\xdf"                 /* jne     nt!PopDemoteIdleness+0x2f */ \
       "\x8b\x86\xb4\xf8\xff\xff" /* mov     eax,dword ptr [esi-74Ch]  */ \
       "\x8b\x80\x38\x01\x00\x00" /* mov     eax,dword ptr [eax+138h]  */ \
       "\x03\x05"                 /* add     eax,dword ptr [nt!PopIdle0PromoteTicks] */
    //
    // specifically the 4 bytes following the above.  Once we have that we just
    // set it to the value we want.  This assumes we're run after the Po init
    // code which we are.  Also, this could be reset by the system if the
    // interrupt rate is changed, an alternative approach would be to nop the
    // above add instruction but we'll try the easier approach first. plj 06/08
    //
    // In theory, we ought to find PopDemoteIdleness+0x44 at PoSetPowerState+0x4c4,
    // at least that's where it is with 2K SP4.
    //
    // Note: The code in the uniprocessor kernel is different and won't
    // match but uniprocessor is much more aggressive about halting so
    // we don't need to patch it anyway.
    //

    if ((RtlCompareMemory(PoSetPowerState + 0x4c4,
                          PopPromoteIdlenessPlus44,
                          sizeof(PopPromoteIdlenessPlus44) - 1)) !=
            (sizeof(PopPromoteIdlenessPlus44) - 1))
    {
        TraceInfo(("XS 2K Idle Delay opt: code mismatch, skipping.\n"));
        return 0;
    }

    PopIdle0PromoteTicks = *(PULONG *)(PoSetPowerState + 0x4c4 +
                                       sizeof(PopPromoteIdlenessPlus44) - 1);

    TraceInfo(("XS 2K Idle Delay opt: was %d, reset to 0.\n", 
                *PopIdle0PromoteTicks));

    // Finally, do the deed.
    *PopIdle0PromoteTicks = 0;
    return 1;
}


/* Hack: when resuming from hibernation, we need to find out the
   current Windows version from high IRQL in order to decide which
   binpatches we're going to apply.  Cache the results so that they're
   available when they're needed. */

/* Function-pointer type through which XenutilGetVersionInfo calls the
   "RtlGetVersion" routine it resolves by name at run time.
   NOTE(review): no explicit calling convention is given, so this relies
   on the build's default matching the kernel export -- confirm. */
typedef NTSTATUS (*PFN_RTL_GET_VERSION)(
    IN OUT PRTL_OSVERSIONINFOW VersionInformation
    );


/* Function-pointer type through which XenutilGetVersionInfo calls the
   "PsGetVersion" routine it resolves by name at run time.  The first
   three arguments receive the major version, minor version and build
   number; the caller in this file passes NULL for the string argument.
   NOTE(review): no explicit calling convention is given, so this relies
   on the build's default matching the kernel export -- confirm. */
typedef BOOLEAN (*PFN_PS_GET_VERSION)(
    OUT PULONG MajorVersion,
    OUT PULONG MinorVersion,
    OUT PULONG BuildNumber,
    IN OUT PUNICODE_STRING CSDVersion
    );


/* Copy the OS version information into *out.

   The information is gathered on the first call and kept in a
   function-local static; every later call just copies the cached
   structure and calls no system routine.

   out - receives a copy of the (cached) version information. */
VOID
XenutilGetVersionInfo(PRTL_OSVERSIONINFOEXW out)
{
    static RTL_OSVERSIONINFOEXW cachedInfo;
    static int cacheValid;
    UNICODE_STRING routineName;
    PFN_RTL_GET_VERSION pfnRtlGetVersion;
    PFN_PS_GET_VERSION pfnPsGetVersion;

    if (!cacheValid)
    {
        RtlZeroMemory(&cachedInfo, sizeof(cachedInfo));
        cachedInfo.dwOSVersionInfoSize = sizeof(cachedInfo);

        // MmGetSystemRoutineAddress exists on Windows 2000 and later;
        // RtlGetVersion only arrived with Windows XP, hence the lookup
        // by name instead of a direct call.
        RtlInitUnicodeString(&routineName, L"RtlGetVersion");
        pfnRtlGetVersion = (PFN_RTL_GET_VERSION)(ULONG_PTR)
                           MmGetSystemRoutineAddress(&routineName);

        if (pfnRtlGetVersion == NULL)
        {
            // No RtlGetVersion, so this is pre-XP, i.e. Windows 2000.
            // PsGetVersion is obsolete, so it too is looked up by name
            // with MmGetSystemRoutineAddress; a direct reference could
            // stop this driver loading on a more modern system.
            RtlInitUnicodeString(&routineName, L"PsGetVersion");
            pfnPsGetVersion = (PFN_PS_GET_VERSION)(ULONG_PTR)
                              MmGetSystemRoutineAddress(&routineName);

            if (pfnPsGetVersion == NULL)
            {
                // Nothing to ask: report Windows 2000 (5.0).
                cachedInfo.dwMajorVersion = 5;
                cachedInfo.dwMinorVersion = 0;
            }
            else
            {
                ULONG majorVersion;
                ULONG minorVersion;
                ULONG build;

                pfnPsGetVersion(&majorVersion, &minorVersion, &build, NULL);

                cachedInfo.dwMajorVersion = majorVersion;
                cachedInfo.dwMinorVersion = minorVersion;
                cachedInfo.dwBuildNumber  = build;
            }
        }
        else
        {
            pfnRtlGetVersion((PRTL_OSVERSIONINFOW)&cachedInfo);
        }

        cacheValid = 1;
    }

    *out = cachedInfo;
}

/* Classify the running OS, then attempt each binary patch in a fixed
   order (APIC, TLB flush, spinlocks, debugger poll, 2K idle delay).
   Every patch routine that reports success has its DEBUG_PATCH_ bit
   recorded in _g_BinPatchFeatureInstalled.  _g_BinPatchPhaseComplete is
   set once all attempts have been made, whether or not any succeeded. */
static VOID
BinPatch(void)
{
    // Find out which OS we are running on.
    XenutilGetVersionInfo(&osInfo);

    // Only the 5.x family is classified here.  Vista reports major
    // version 6 but there are no patches for it yet, so osType is left
    // untouched for it (and for any unrecognised 5.x minor version).

    if (osInfo.dwMajorVersion == 5)
    {
        if (osInfo.dwMinorVersion == 0)
        {
            osType = osW2k;
        }
        else if (osInfo.dwMinorVersion == 1)
        {
            osType = osXP;
        }
        else if (osInfo.dwMinorVersion == 2)
        {
            osType = osW2k3;
        }
    }

#define XenPVFeatureEnabled(x) (1)
    if (XenPVFeatureEnabled(DEBUG_PATCH_APIC))
    {
        //
        // If the AMD AltMovCR8 feature is present it can stand in for
        // moves to/from the APIC TPR.  Obviously 32 bit only as 64 uses
        // CR8 directly.
        //

        ULONG eax, ebx, ecx, edx;

        // Strictly we should first check that extended CPUID leaves
        // exist, but Xen (and therefore this code) does not run on any
        // processor lacking them, so go straight to the extended
        // feature bits.

        _cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
        if ((ecx & 0x10) == 0)
        {
            _g_ApicAltMovCR8AMD = FALSE;
        }
        else
        {
            TraceNotice(("Using alternative mov to cr8 instruction\n"));
            _g_ApicAltMovCR8AMD = TRUE;
        }

        // MapApicPages is only needed (and only called) when AltMovCR8
        // is not available.

        if (_g_ApicAltMovCR8AMD || MapApicPages())
        {
            if (_PatchApic())
            {
                _g_BinPatchFeatureInstalled |= DEBUG_PATCH_APIC;
            }
        }
    }

    if (XenPVFeatureEnabled(DEBUG_PATCH_TLB_FLUSH) && _PatchTlbFlush())
    {
        _g_BinPatchFeatureInstalled |= DEBUG_PATCH_TLB_FLUSH;
    }

    if (XenPVFeatureEnabled(DEBUG_PATCH_SPINLOCKS) && _PatchSpinLocks())
    {
        _g_BinPatchFeatureInstalled |= DEBUG_PATCH_SPINLOCKS;
    }

    if (XenPVFeatureEnabled(DEBUG_PATCH_KD_POLL) && _PatchPollDebugger())
    {
        _g_BinPatchFeatureInstalled |= DEBUG_PATCH_KD_POLL;
    }

    if (XenPVFeatureEnabled(DEBUG_PATCH_2K_IDLE_DELAY) && _Patch2KIdleDelay())
    {
        _g_BinPatchFeatureInstalled |= DEBUG_PATCH_2K_IDLE_DELAY;
    }

    _g_BinPatchPhaseComplete = 1;
}


/* Unload callback registered by DriverEntry.  It performs no cleanup;
   it only emits a debug message saying that the unload is probably
   fatal.

   DriverObject - the driver object being unloaded (not used). */
VOID
xpUnload(__in PDRIVER_OBJECT DriverObject)
{
    DbgPrint("XenPatch driver unload called, this is probably fatal!\n");
}


NTSTATUS
DriverEntry(
    __in PDRIVER_OBJECT  DriverObject,
    __in PUNICODE_STRING RegistryPath
    )
/*++

Routine Description:

    Installable driver initialization entry point.
    This entry point is called directly by the I/O system.

    Registers xpUnload as the unload routine, calls InitHvm() and, only
    if that succeeds, applies the binary patches via BinPatch().  The
    device-object, symbolic-link, dispatch-routine and event-channel
    setup is currently compiled out (#if 0).

Arguments:

    DriverObject - pointer to the driver object

    RegistryPath - pointer to a unicode string representing the path,
                   to driver-specific key in the registry.  Not used.

Return Value:

    The status returned by InitHvm().  BinPatch() reports no status, so
    a success status means the patches were attempted, not necessarily
    that any of them was applied.

--*/
{
    NTSTATUS            status = STATUS_SUCCESS;
#if 0
    // Only referenced by the disabled device / symbolic-link code below;
    // kept under the same condition so the live build has no
    // unreferenced locals.
    UNICODE_STRING      unicodeDeviceName;
    UNICODE_STRING      unicodeDosDeviceName;
    PDEVICE_OBJECT      devObj;
    UNICODE_STRING      unicodeString;
#endif

    UNREFERENCED_PARAMETER (RegistryPath);

    DbgPrint("DriverEntry XenPatch Enter \n");

#if 0
    (void)RtlInitUnicodeString(&unicodeDeviceName, DRIVER_NAME);

    //
    // We will create a secure deviceobject so that only processes running
    // in admin and local system account can access the device. Refer
    // "Security Descriptor String Format" section in the platform
    // SDK documentation to understand the format of the sddl string.
    // We need to do because this is a legacy driver and there is no INF
    // involved in installing the driver. For PNP drivers, security descriptor
    // is typically specified for the FDO in the INF file.
    //
    // Security Descriptor
    //
    // D: means it's a DACL (Discretionary Access Control List), 
    // P  means it's protected.
    //
    // ACEs are enclosed in parameters and have 6 fields
    //  ACE type                                A       Allowed
    //  ACE flags                               .
    //  Permission                              GA      Generic All
    //  Object Type                             .
    //  Inherited Object Type                   .
    //  Trustee                                 BA      Built-in Administrators
    //
    // Details http://msdn.microsoft.com/en-us/library/aa379567(VS.85).aspx
    // http://blogs.dirteam.com/blogs/jorge/archive/2008/03/26/parsing-sddl-strings.aspx
    //

    (void)RtlInitUnicodeString(&unicodeString, L"D:P(A;;GA;;;SY)(A;;GA;;;BA)");

    status = IoCreateDeviceSecure(
                DriverObject,
                0,
                &unicodeDeviceName,
                FILE_DEVICE_UNKNOWN,
                FILE_DEVICE_SECURE_OPEN,
                (BOOLEAN) FALSE,
                &unicodeString,
                (LPCGUID)&GUID_DEVCLASS_XENPATCH,
                &devObj
                );
    if (!NT_SUCCESS(status))
    {
        return status;
    }

    DbgPrint("DeviceObject %p\n", devObj);

    //
    // Allocate and initialize a Unicode String containing the Win32 name
    // for our device.
    //

    RtlInitUnicodeString(&unicodeDosDeviceName, DRIVER_NAME);

    status = IoCreateSymbolicLink(
                (PUNICODE_STRING) &unicodeDosDeviceName,
                (PUNICODE_STRING) &unicodeDeviceName
                );

    if (!NT_SUCCESS(status))
    {
        IoDeleteDevice(devObj);
        return status;
    }
#endif

#if 0
    DriverObject->MajorFunction[IRP_MJ_CREATE]=
    DriverObject->MajorFunction[IRP_MJ_CLOSE] = xpCreateClose;
    DriverObject->MajorFunction[IRP_MJ_CLEANUP] = xpCleanup;
#endif

    DriverObject->DriverUnload = xpUnload;

#if 0
    XenevtchnInitIoHole(MemBase, MemBaseVa, NBytes);
    status = EvtchnStart();
#endif

    // Patches are only attempted once InitHvm() has succeeded.
    status = InitHvm();
    if (!NT_SUCCESS(status))
    {
        // The failing call is InitHvm(); the event channel start above
        // is compiled out, so do not blame it in the message.
        DbgPrint("XenPatch failed to init HVM.\n");
    }
    else
    {
        BinPatch();
    }
    DbgPrint("DriverEntry XenPatch Exit = %x\n", status);

    return status;
}


