/*
 * vmx.c: handling VMX architecture-related VM exits
 * Copyright (c) 2004, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/hypercall.h>
#include <xen/perfc.h>
#include <asm/current.h>
#include <asm/io.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/types.h>
#include <asm/msr.h>
#include <asm/spinlock.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/vmx/cpu.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <asm/hvm/vpic.h>
#include <asm/hvm/vlapic.h>
#include <asm/x86_emulate.h>
#include <asm/hvm/vpt.h>
#include <public/hvm/save.h>
#include <asm/hvm/trace.h>

/*
 * NOTE(review): presumably the MSR-intercept bitmap shared by all VMCSs; it
 * is only declared in this part of the file, so confirm against its users.
 */
char *vmx_msr_bitmap;

/* Context-switch callbacks; vmx_vcpu_initialise() installs them per vcpu. */
static void vmx_ctxt_switch_from(struct vcpu *v);
static void vmx_ctxt_switch_to(struct vcpu *v);

/* vlapic mapping helpers used by the domain/vcpu init and destroy paths. */
static int  vmx_alloc_vlapic_mapping(struct domain *d);
static void vmx_free_vlapic_mapping(struct domain *d);
static void vmx_install_vlapic_mapping(struct vcpu *v);

/*
 * Per-domain VMX setup.  The only work done is the vlapic mapping
 * allocation, whose result is returned unchanged.
 */
static int vmx_domain_initialise(struct domain *d)
{
    int rc = vmx_alloc_vlapic_mapping(d);

    return rc;
}

/* Per-domain VMX teardown: hands the domain to vmx_free_vlapic_mapping(). */
static void vmx_domain_destroy(struct domain *d)
{
    vmx_free_vlapic_mapping(d);
}

/*
 * Per-vcpu VMX setup: initialise the VMCS lock, install the schedule-tail
 * and context-switch callbacks, create the VMCS and install the vlapic
 * mapping.
 *
 * Returns 0 on success, otherwise the error from vmx_create_vmcs().
 */
static int vmx_vcpu_initialise(struct vcpu *v)
{
    int rc;

    spin_lock_init(&v->arch.hvm_vmx.vmcs_lock);

    v->arch.schedule_tail    = vmx_do_resume;
    v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
    v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;

    rc = vmx_create_vmcs(v);
    if ( rc == 0 )
    {
        /* The mapping is only installed once a VMCS exists. */
        vmx_install_vlapic_mapping(v);
        return 0;
    }

    dprintk(XENLOG_WARNING,
            "Failed to create VMCS for vcpu %d: err=%d.\n",
            v->vcpu_id, rc);
    return rc;
}

/* Per-vcpu VMX teardown: hands the vcpu to vmx_destroy_vmcs(). */
static void vmx_vcpu_destroy(struct vcpu *v)
{
    vmx_destroy_vmcs(v);
}

/* Returns 1 iff both PE and PG are set in the vcpu's shadow CR0, else 0. */
static int vmx_paging_enabled(struct vcpu *v)
{
    const unsigned long paging_bits = X86_CR0_PE | X86_CR0_PG;

    return (v->arch.hvm_vmx.cpu_shadow_cr0 & paging_bits) == paging_bits;
}

/* Returns the (masked) PG bit of the vcpu's shadow CR0; non-zero if set. */
static int vmx_pgbit_test(struct vcpu *v)
{
    unsigned long shadow_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;

    return shadow_cr0 & X86_CR0_PG;
}

/*
 * Returns 1 iff paging is enabled (see vmx_paging_enabled()) and PAE is set
 * in the vcpu's shadow CR4, else 0.
 */
static int vmx_pae_enabled(struct vcpu *v)
{
    if ( !vmx_paging_enabled(v) )
        return 0;

    return (v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) != 0;
}

/* Returns the (masked) NX bit of the vcpu's recorded EFER; non-zero if set. */
static int vmx_nx_enabled(struct vcpu *v)
{
    return (v->arch.hvm_vmx.efer & EFER_NX);
}

#ifdef __x86_64__

/* Returns the (masked) LME bit of the vcpu's recorded EFER; non-zero if set. */
static int vmx_lme_is_set(struct vcpu *v)
{
    return (v->arch.hvm_vmx.efer & EFER_LME);
}

/*
 * Returns the (masked) LMA bit of the vcpu's recorded EFER; non-zero means
 * long mode is active for this vcpu.
 */
static int vmx_long_mode_enabled(struct vcpu *v)
{
    return (v->arch.hvm_vmx.efer & EFER_LMA);
}

/*
 * Switch the vcpu into long mode: set the IA-32e mode bit in the VM-entry
 * controls of the currently loaded VMCS and record EFER.LMA for the vcpu.
 */
static void vmx_enable_long_mode(struct vcpu *v)
{
    unsigned long entry_ctls = __vmread(VM_ENTRY_CONTROLS);

    __vmwrite(VM_ENTRY_CONTROLS, entry_ctls | VM_ENTRY_IA32E_MODE);

    v->arch.hvm_vmx.efer |= EFER_LMA;
}

/*
 * Take the vcpu out of long mode: clear the IA-32e mode bit in the VM-entry
 * controls of the currently loaded VMCS and clear the recorded EFER.LMA.
 */
static void vmx_disable_long_mode(struct vcpu *v)
{
    unsigned long entry_ctls = __vmread(VM_ENTRY_CONTROLS);

    __vmwrite(VM_ENTRY_CONTROLS, entry_ctls & ~VM_ENTRY_IA32E_MODE);

    v->arch.hvm_vmx.efer &= ~EFER_LMA;
}

/*
 * Per-CPU copy of the host's values for the MSRs in msr_index[], captured by
 * vmx_save_host_msrs().  Its 'flags' bits mark MSRs that currently hold a
 * guest value and must be put back by vmx_restore_host_msrs().
 */
static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state);

/*
 * MSR numbers for each slot of vmx_msr_state.msrs[].
 * NOTE(review): the order here presumably has to match the VMX_INDEX_MSR_*
 * constants used to index msrs[] elsewhere -- confirm against their
 * definition.
 */
static u32 msr_index[VMX_MSR_COUNT] =
{
    MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK
};

static void vmx_save_host_msrs(void)
{
    struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
    int i;

    for ( i = 0; i < VMX_MSR_COUNT; i++ )
        rdmsrl(msr_index[i], host_msr_state->msrs[i]);
}

/*
 * Helper for the MSR-write switch in long_mode_do_msr_write().  For MSR
 * <address> it records msr_content in the guest's msr_state, marks the slot
 * dirty in both the guest and host flag words, and loads the value into the
 * hardware MSR.
 *
 * Caveats:
 *  - relies on 'guest_msr_state', 'host_msr_state' and 'msr_content' being
 *    in scope at the expansion site;
 *  - expands to several statements and ENDS WITH 'break', so it must be the
 *    last thing in a switch case and must not be used as the unbraced body
 *    of an if/else.
 */
#define WRITE_MSR(address)                                              \
        guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
        set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags);    \
        wrmsrl(MSR_ ## address, msr_content);                           \
        set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags);     \
        break

/*
 * Handle a guest RDMSR of one of the long-mode related MSRs on behalf of the
 * current vcpu.
 *
 * The MSR number is taken from regs->ecx.  On success the 64-bit value is
 * split across regs->eax (low half) and regs->edx (high half) and 1 is
 * returned.  0 is returned when the MSR is not handled here, or after a #GP
 * has been injected because FS/GS/shadow-GS base was read outside long mode.
 */
static int long_mode_do_msr_read(struct cpu_user_regs *regs)
{
    u64 msr_content = 0;
    u32 ecx = regs->ecx;
    struct vcpu *v = current;
    struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;

    switch ( ecx ) {
    case MSR_EFER:
        msr_content = v->arch.hvm_vmx.efer;
        break;

    case MSR_FS_BASE:
        msr_content = __vmread(GUEST_FS_BASE);
        goto check_long_mode;

    case MSR_GS_BASE:
        msr_content = __vmread(GUEST_GS_BASE);
        goto check_long_mode;

    case MSR_SHADOW_GS_BASE:
        /*
         * Read the live MSR, not v->arch.hvm_vmx.shadow_gs: the guest can
         * change this MSR with SWAPGS (see vmx_save_guest_msrs()), and the
         * cached copy is only refreshed when the vcpu is switched out or
         * the guest writes the MSR.  While v == current the hardware MSR
         * holds the guest's value (loaded by vmx_restore_guest_msrs()).
         */
        rdmsrl(MSR_SHADOW_GS_BASE, msr_content);
    check_long_mode:
        if ( !(vmx_long_mode_enabled(v)) )
        {
            vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
            return 0;
        }
        break;

    case MSR_STAR:
        msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR];
        break;

    case MSR_LSTAR:
        msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR];
        break;

    case MSR_CSTAR:
        msr_content = v->arch.hvm_vmx.cstar;
        break;

    case MSR_SYSCALL_MASK:
        msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];
        break;

    default:
        /* Not one of ours. */
        return 0;
    }

    HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);

    regs->eax = (u32)(msr_content >>  0);
    regs->edx = (u32)(msr_content >> 32);

    return 1;
}

/*
 * Handle a guest WRMSR of one of the long-mode related MSRs on behalf of the
 * current vcpu.
 *
 * The MSR number is taken from regs->ecx and the 64-bit value from
 * regs->edx:regs->eax.  Returns 1 when the write was handled.  Returns 0
 * when the MSR is not handled here, or after a #GP has been injected
 * (reserved EFER bits, toggling EFER.LME with paging enabled, base MSR
 * written outside long mode, or a non-canonical address).
 */
static int long_mode_do_msr_write(struct cpu_user_regs *regs)
{
    u64 msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
    u32 ecx = regs->ecx;
    struct vcpu *v = current;
    struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state;
    struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);

    HVM_DBG_LOG(DBG_LEVEL_0, "msr 0x%x content 0x%"PRIx64, ecx, msr_content);

    switch ( ecx )
    {
    case MSR_EFER:
        /* offending reserved bit will cause #GP */
        if ( (msr_content & ~(EFER_LME | EFER_LMA | EFER_NX | EFER_SCE)) ||
             (!cpu_has_nx && (msr_content & EFER_NX)) ||
             (!cpu_has_syscall && (msr_content & EFER_SCE)) )
        {
            gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
                     "EFER: %"PRIx64"\n", msr_content);
            goto gp_fault;
        }

        /* EFER.LME may only change state while paging is disabled. */
        if ( (msr_content & EFER_LME)
             &&  !(v->arch.hvm_vmx.efer & EFER_LME) )
        {
            if ( unlikely(vmx_paging_enabled(v)) )
            {
                gdprintk(XENLOG_WARNING,
                         "Trying to set EFER.LME with paging enabled\n");
                goto gp_fault;
            }
        }
        else if ( !(msr_content & EFER_LME)
                  && (v->arch.hvm_vmx.efer & EFER_LME) )
        {
            if ( unlikely(vmx_paging_enabled(v)) )
            {
                gdprintk(XENLOG_WARNING,
                         "Trying to clear EFER.LME with paging enabled\n");
                goto gp_fault;
            }
        }

        /* Only NX and SCE are propagated to the hardware EFER. */
        if ( (msr_content ^ v->arch.hvm_vmx.efer) & (EFER_NX|EFER_SCE) )
            write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
                       (msr_content & (EFER_NX|EFER_SCE)));

        v->arch.hvm_vmx.efer = msr_content;
        break;

    case MSR_FS_BASE:
    case MSR_GS_BASE:
    case MSR_SHADOW_GS_BASE:
        if ( !vmx_long_mode_enabled(v) )
            goto gp_fault;

        if ( !is_canonical_address(msr_content) )
            goto uncanonical_address;

        if ( ecx == MSR_FS_BASE )
            __vmwrite(GUEST_FS_BASE, msr_content);
        else if ( ecx == MSR_GS_BASE )
            __vmwrite(GUEST_GS_BASE, msr_content);
        else
        {
            /* Shadow GS has no VMCS field: keep the cache and MSR in step. */
            v->arch.hvm_vmx.shadow_gs = msr_content;
            wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
        }

        break;

    /*
     * NB. WRITE_MSR() ends with 'break', so none of the cases below fall
     * through despite appearances.
     */
    case MSR_STAR:
        WRITE_MSR(STAR);

    case MSR_LSTAR:
        if ( !is_canonical_address(msr_content) )
            goto uncanonical_address;
        WRITE_MSR(LSTAR);

    case MSR_CSTAR:
        if ( !is_canonical_address(msr_content) )
            goto uncanonical_address;
        /* Only recorded for the vcpu; not written to hardware here. */
        v->arch.hvm_vmx.cstar = msr_content;
        break;

    case MSR_SYSCALL_MASK:
        WRITE_MSR(SYSCALL_MASK);

    default:
        /* Not one of ours. */
        return 0;
    }

    return 1;

 uncanonical_address:
    HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
 gp_fault:
    vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
    return 0;
}

/*
 * To avoid MSR save/restore at every VM exit/entry time, we restore
 * the x86_64 specific MSRs at domain switch time. Since these MSRs
 * are not modified once set for para domains, we don't save them,
 * but simply reset them to values set in percpu_traps_init().
 */
static void vmx_restore_host_msrs(void)
{
    struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state);
    int i;

    while ( host_msr_state->flags )
    {
        i = find_first_set_bit(host_msr_state->flags);
        wrmsrl(msr_index[i], host_msr_state->msrs[i]);
        clear_bit(i, &host_msr_state->flags);
    }

    if ( cpu_has_nx && !(read_efer() & EFER_NX) )
        write_efer(read_efer() | EFER_NX);
}

/*
 * Refresh the vcpu's cached shadow GS base from hardware.  This is needed
 * because the guest can alter MSR_SHADOW_GS_BASE via the swapgs
 * instruction.
 */
static void vmx_save_guest_msrs(struct vcpu *v)
{
    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
}

static void vmx_restore_guest_msrs(struct vcpu *v)
{
    struct vmx_msr_state *guest_msr_state, *host_msr_state;
    unsigned long guest_flags;
    int i;

    guest_msr_state = &v->arch.hvm_vmx.msr_state;
    host_msr_state = &this_cpu(host_msr_state);

    wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);

    guest_flags = guest_msr_state->flags;

    while ( guest_flags )
    {
        i = find_first_set_bit(guest_flags);

        HVM_DBG_LOG(DBG_LEVEL_2,
                    "restore guest's index %d msr %x with value %lx",
                    i, msr_index[i], guest_msr_state->msrs[i]);
        set_bit(i, &host_msr_state->flags);
        wrmsrl(msr_index[i], guest_msr_state->msrs[i]);
        clear_bit(i, &guest_flags);
    }

    if ( (v->arch.hvm_vmx.efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
    {
        HVM_DBG_LOG(DBG_LEVEL_2,
                    "restore guest's EFER with value %lx",
                    v->arch.hvm_vmx.efer);
        write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
                   (v->arch.hvm_vmx.efer & (EFER_NX | EFER_SCE)));
    }
}

#else  /* __i386__ */

/* i386 build: always reports EFER.LME as clear. */
static int vmx_lme_is_set(struct vcpu *v)
{
    return 0;
}
/* i386 build: always reports long mode as inactive. */
static int vmx_long_mode_enabled(struct vcpu *v)
{
    return 0;
}
/* i386 build: must never be reached; calling it is a hypervisor bug. */
static void vmx_enable_long_mode(struct vcpu *v)
{
    BUG();
}
/* i386 build: must never be reached; calling it is a hypervisor bug. */
static void vmx_disable_long_mode(struct vcpu *v)
{
    BUG();
}

/* i386 build: there are no host MSRs to snapshot, so this is a no-op. */
#define vmx_save_host_msrs()        ((void)0)

/*
 * i386 build: the only host state to reinstate is EFER.NX, which is set
 * again if the CPU supports NX and the bit is currently clear.
 */
static void vmx_restore_host_msrs(void)
{
    if ( !cpu_has_nx )
        return;

    if ( !(read_efer() & EFER_NX) )
        write_efer(read_efer() | EFER_NX);
}

/* i386 build: no guest MSRs need saving; the argument is not evaluated. */
#define vmx_save_guest_msrs(v)      ((void)0)

/*
 * i386 build: make the hardware EFER.NX bit match the vcpu's recorded EFER,
 * writing the MSR only when the two differ.
 */
static void vmx_restore_guest_msrs(struct vcpu *v)
{
    if ( !((v->arch.hvm_vmx.efer ^ read_efer()) & EFER_NX) )
        return;

    HVM_DBG_LOG(DBG_LEVEL_2,
                "restore guest's EFER with value %lx",
                v->arch.hvm_vmx.efer);
    write_efer((read_efer() & ~EFER_NX) |
               (v->arch.hvm_vmx.efer & EFER_NX));
}

/*
 * i386 build: handle a guest RDMSR.  Only MSR_EFER (selected by regs->ecx)
 * is recognised; its recorded value is returned in regs->edx:regs->eax.
 *
 * Returns 1 if the MSR was handled, 0 otherwise.
 */
static int long_mode_do_msr_read(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    u64 msr_content;

    if ( regs->ecx != MSR_EFER )
        return 0;

    msr_content = v->arch.hvm_vmx.efer;

    regs->eax = msr_content >>  0;
    regs->edx = msr_content >> 32;

    return 1;
}

/*
 * i386 build: handle a guest WRMSR.  Only MSR_EFER (selected by regs->ecx)
 * is recognised, and only its NX bit may be set (and only when the CPU
 * supports NX); anything else injects #GP.
 *
 * Returns 1 if the write was handled, 0 if the MSR is not handled here or a
 * #GP was injected.
 */
static int long_mode_do_msr_write(struct cpu_user_regs *regs)
{
    u64 msr_content = regs->eax | ((u64)regs->edx << 32);
    struct vcpu *v = current;

    if ( regs->ecx != MSR_EFER )
        return 0;

    /* offending reserved bit will cause #GP */
    if ( (msr_content & ~EFER_NX) ||
         (!cpu_has_nx && (msr_content & EFER_NX)) )
    {
        gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
                 "EFER: %"PRIx64"\n", msr_content);
        vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
        return 0;
    }

    /* Touch the hardware MSR only when the NX bit actually changes. */
    if ( (msr_content ^ v->arch.hvm_vmx.efer) & EFER_NX )
        write_efer((read_efer() & ~EFER_NX) | (msr_content & EFER_NX));

    v->arch.hvm_vmx.efer = msr_content;

    return 1;
}

#endif /* __i386__ */

/*
 * Classify the execution mode of the current vcpu.
 *
 * Returns 0 when CR0.PE is clear in the shadow CR0, 1 when EFLAGS.VM is
 * set, 8 when long mode is enabled and the CS access rights carry the
 * long-mode bit, and otherwise 4 or 2 according to the CS default operand
 * size bit.
 */
static int vmx_guest_x86_mode(struct vcpu *v)
{
    unsigned int cs_ar;

    ASSERT(v == current);

    if ( unlikely(!(v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_PE)) )
        return 0;

    if ( unlikely(__vmread(GUEST_RFLAGS) & X86_EFLAGS_VM) )
        return 1;

    cs_ar = __vmread(GUEST_CS_AR_BYTES);

    if ( vmx_long_mode_enabled(v) &&
         likely(cs_ar & X86_SEG_AR_CS_LM_ACTIVE) )
        return 8;

    if ( likely(cs_ar & X86_SEG_AR_DEF_OP_SIZE) )
        return 4;

    return 2;
}

/*
 * If the vcpu's debug registers are marked dirty, copy DR0-3 and DR6 from
 * hardware (and DR7 from the VMCS) into the guest context, clear the dirty
 * flag and turn MOV-DR exiting back on.  Does nothing otherwise.
 */
static void vmx_save_dr(struct vcpu *v)
{
    if ( !v->arch.hvm_vcpu.flag_dr_dirty )
        return;

    /* Mark the registers clean and intercept DR accesses again. */
    v->arch.hvm_vcpu.flag_dr_dirty = 0;
    v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING;
    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);

    /* Breakpoint address registers. */
    v->arch.guest_context.debugreg[0] = read_debugreg(0);
    v->arch.guest_context.debugreg[1] = read_debugreg(1);
    v->arch.guest_context.debugreg[2] = read_debugreg(2);
    v->arch.guest_context.debugreg[3] = read_debugreg(3);

    /* Debug status. */
    v->arch.guest_context.debugreg[6] = read_debugreg(6);

    /* vmx_restore_dr() consults the saved DR7, so it must be kept too. */
    v->arch.guest_context.debugreg[7] = __vmread(GUEST_DR7);
}

/*
 * Load DR0-3 and DR6 from the vcpu's guest context into hardware and mark
 * the debug registers dirty.  Must only be called while they are clean.
 * DR7 is not written here because it is loaded from the VMCS.
 */
static void __restore_debug_registers(struct vcpu *v)
{
    ASSERT(!v->arch.hvm_vcpu.flag_dr_dirty);
    v->arch.hvm_vcpu.flag_dr_dirty = 1;

    /* Breakpoint address registers. */
    write_debugreg(0, v->arch.guest_context.debugreg[0]);
    write_debugreg(1, v->arch.guest_context.debugreg[1]);
    write_debugreg(2, v->arch.guest_context.debugreg[2]);
    write_debugreg(3, v->arch.guest_context.debugreg[3]);

    /* Debug status. */
    write_debugreg(6, v->arch.guest_context.debugreg[6]);
}

/*
 * DR7 is saved and restored on every vmexit.  The remaining debug registers
 * only matter when they can influence execution, i.e. when at least one
 * breakpoint is enabled, so this mask keeps just the DR7 bits that enable
 * some breakpoint functionality.
 */
#define DR7_ACTIVE_MASK 0xff

/*
 * Reload the guest's debug registers, but only if its saved DR7 has a
 * breakpoint enabled.
 */
static void vmx_restore_dr(struct vcpu *v)
{
    /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */
    if ( likely(!(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK)) )
        return;

    __restore_debug_registers(v);
}

/*
 * Copy the vcpu's architectural state into the save record @c.
 *
 * RIP/RSP/RFLAGS, the descriptor-table and segment registers, and the
 * SYSENTER MSRs are read from the VMCS; CR0/CR2/CR3/CR4 and EFER are taken
 * from the values cached in v->arch.hvm_vmx.  Any event that was in the
 * middle of being delivered is recorded in c->pending_event/c->error_code.
 *
 * The VMCS accesses are bracketed by vmx_vmcs_enter()/vmx_vmcs_exit().
 */
void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c)
{
    uint32_t ev;

    vmx_vmcs_enter(v);

    c->rip = __vmread(GUEST_RIP);
    c->rsp = __vmread(GUEST_RSP);
    c->rflags = __vmread(GUEST_RFLAGS);

    /* Control registers: the shadow (guest-written) CR0/CR4 are saved. */
    c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
    c->cr2 = v->arch.hvm_vmx.cpu_cr2;
    c->cr3 = v->arch.hvm_vmx.cpu_cr3;
    c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;

    c->msr_efer = v->arch.hvm_vmx.efer;

#ifdef HVM_DEBUG_SUSPEND
    printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
           __func__, c->cr3, c->cr0, c->cr4);
#endif

    /* Descriptor-table registers. */
    c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
    c->idtr_base = __vmread(GUEST_IDTR_BASE);

    c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
    c->gdtr_base = __vmread(GUEST_GDTR_BASE);

    /* Segment registers: selector, limit, base and access rights each. */
    c->cs_sel = __vmread(GUEST_CS_SELECTOR);
    c->cs_limit = __vmread(GUEST_CS_LIMIT);
    c->cs_base = __vmread(GUEST_CS_BASE);
    c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);

    c->ds_sel = __vmread(GUEST_DS_SELECTOR);
    c->ds_limit = __vmread(GUEST_DS_LIMIT);
    c->ds_base = __vmread(GUEST_DS_BASE);
    c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);

    c->es_sel = __vmread(GUEST_ES_SELECTOR);
    c->es_limit = __vmread(GUEST_ES_LIMIT);
    c->es_base = __vmread(GUEST_ES_BASE);
    c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);

    c->ss_sel = __vmread(GUEST_SS_SELECTOR);
    c->ss_limit = __vmread(GUEST_SS_LIMIT);
    c->ss_base = __vmread(GUEST_SS_BASE);
    c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);

    c->fs_sel = __vmread(GUEST_FS_SELECTOR);
    c->fs_limit = __vmread(GUEST_FS_LIMIT);
    c->fs_base = __vmread(GUEST_FS_BASE);
    c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);

    c->gs_sel = __vmread(GUEST_GS_SELECTOR);
    c->gs_limit = __vmread(GUEST_GS_LIMIT);
    c->gs_base = __vmread(GUEST_GS_BASE);
    c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);

    c->tr_sel = __vmread(GUEST_TR_SELECTOR);
    c->tr_limit = __vmread(GUEST_TR_LIMIT);
    c->tr_base = __vmread(GUEST_TR_BASE);
    c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);

    c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
    c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
    c->ldtr_base = __vmread(GUEST_LDTR_BASE);
    c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);

    /* SYSENTER MSRs. */
    c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
    c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
    c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);

    /*
     * Save any event/interrupt that was being injected when we last
     * exited. IDT_VECTORING_INFO_FIELD has priority, as anything in
     * VM_ENTRY_INTR_INFO_FIELD is either a fault caused by the first
     * event, which will happen the next time, or an interrupt, which we
     * never inject when IDT_VECTORING_INFO_FIELD is valid.
     */
    if ( (ev = __vmread(IDT_VECTORING_INFO_FIELD)) & INTR_INFO_VALID_MASK )
    {
        c->pending_event = ev;
        c->error_code = __vmread(IDT_VECTORING_ERROR_CODE);
    }
    else if ( (ev = __vmread(VM_ENTRY_INTR_INFO_FIELD)) &
              INTR_INFO_VALID_MASK )
    {
        c->pending_event = ev;
        c->error_code = __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE);
    }
    else
    {
        /* Nothing in flight. */
        c->pending_event = 0;
        c->error_code = 0;
    }

    vmx_vmcs_exit(v);
}

/*
 * Load the architectural state held in the save record @c into the vcpu's
 * VMCS and the cached copies in v->arch.hvm_vmx.
 *
 * When the restored CR0 has paging enabled, a reference is taken on the page
 * named by c->cr3 and installed as the vcpu's guest_table (dropping the
 * reference on the previous one).  If @c carries a valid pending event it is
 * queued for re-injection on the next VM entry.
 *
 * Returns 0 on success, or -EINVAL if c->cr3 does not name a valid page of
 * the domain or the pending event is malformed.  Every return path leaves
 * vmx_vmcs_enter()/vmx_vmcs_exit() balanced.
 */
int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c)
{
    unsigned long mfn, old_base_mfn;

    vmx_vmcs_enter(v);

    __vmwrite(GUEST_RIP, c->rip);
    __vmwrite(GUEST_RSP, c->rsp);
    __vmwrite(GUEST_RFLAGS, c->rflags);

    /*
     * The real guest CR0 always has PE, PG, NE, WP and ET forced on; the
     * value the guest asked for lives in the read shadow.
     */
    v->arch.hvm_vmx.cpu_cr0 = (c->cr0 | X86_CR0_PE | X86_CR0_PG |
                               X86_CR0_NE | X86_CR0_WP | X86_CR0_ET);
    __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
    v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
    __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);

    v->arch.hvm_vmx.cpu_cr2 = c->cr2;

    v->arch.hvm_vmx.efer = c->msr_efer;

#ifdef HVM_DEBUG_SUSPEND
    printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
           __func__, c->cr3, c->cr0, c->cr4);
#endif

    /* vmx_paging_enabled() looks at the shadow CR0 that was just loaded. */
    if ( !vmx_paging_enabled(v) )
    {
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "%s: paging not enabled.", __func__);
        goto skip_cr3;
    }

    HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 = %"PRIx64, c->cr3);
    /* current!=vcpu as not called by arch_vmx_do_launch */
    mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
    if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
    {
        gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%"PRIx64".\n", c->cr3);
        vmx_vmcs_exit(v);
        return -EINVAL;
    }

    /* Swap in the new top-level table, then release the old reference. */
    old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
    v->arch.guest_table = pagetable_from_pfn(mfn);
    if ( old_base_mfn )
        put_page(mfn_to_page(old_base_mfn));

    v->arch.hvm_vmx.cpu_cr3 = c->cr3;

 skip_cr3:
    /* EFER.LMA came from the save record; mirror it in the entry controls. */
    if ( vmx_long_mode_enabled(v) )
        vmx_enable_long_mode(v);

    __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
    v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
    __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);

    /* Descriptor-table registers. */
    __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
    __vmwrite(GUEST_IDTR_BASE, c->idtr_base);

    __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
    __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);

    /* Segment registers: selector, limit, base and access rights each. */
    __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
    __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
    __vmwrite(GUEST_CS_BASE, c->cs_base);
    __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);

    __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
    __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
    __vmwrite(GUEST_DS_BASE, c->ds_base);
    __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);

    __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
    __vmwrite(GUEST_ES_LIMIT, c->es_limit);
    __vmwrite(GUEST_ES_BASE, c->es_base);
    __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);

    __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
    __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
    __vmwrite(GUEST_SS_BASE, c->ss_base);
    __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);

    __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
    __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
    __vmwrite(GUEST_FS_BASE, c->fs_base);
    __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);

    __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
    __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
    __vmwrite(GUEST_GS_BASE, c->gs_base);
    __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);

    __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
    __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
    __vmwrite(GUEST_TR_BASE, c->tr_base);
    __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);

    __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
    __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
    __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
    __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);

    /* SYSENTER MSRs. */
    __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
    __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
    __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);

    __vmwrite(GUEST_DR7, c->dr7);

    vmx_vmcs_exit(v);

    paging_update_paging_modes(v);

    if ( c->pending_valid )
    {
        vmx_vmcs_enter(v);

        gdprintk(XENLOG_INFO, "Re-injecting 0x%"PRIx32", 0x%"PRIx32"\n",
                 c->pending_event, c->error_code);

        /* SVM uses type 3 ("Exception") for #OF and #BP; VMX uses type 6 */
        if ( (c->pending_type == 3) &&
             ((c->pending_vector == 3) || (c->pending_vector == 4)) )
            c->pending_type = 6;

        /* For software exceptions, we need to tell the hardware the
         * instruction length as well (hmmm). */
        if ( c->pending_type > 4 )
        {
            int addrbytes, ilen;
            if ( (c->cs_arbytes & X86_SEG_AR_CS_LM_ACTIVE) &&
                 (c->msr_efer & EFER_LMA) )
                addrbytes = 8;
            else if ( c->cs_arbytes & X86_SEG_AR_DEF_OP_SIZE )
                addrbytes = 4;
            else
                addrbytes = 2;

            ilen = hvm_instruction_fetch(c->rip, addrbytes, NULL);
            __vmwrite(VM_ENTRY_INSTRUCTION_LEN, ilen);
        }

        /* Sanity check */
        if ( (c->pending_type == 1) || (c->pending_type > 6) ||
             (c->pending_reserved != 0) )
        {
            gdprintk(XENLOG_ERR, "Invalid pending event 0x%"PRIx32".\n",
                     c->pending_event);
            /* Balance the vmx_vmcs_enter() above before bailing out. */
            vmx_vmcs_exit(v);
            return -EINVAL;
        }

        /* Re-inject the exception */
        __vmwrite(VM_ENTRY_INTR_INFO_FIELD, c->pending_event);
        __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code);
        v->arch.hvm_vmx.vector_injected = 1;

        vmx_vmcs_exit(v);
    }

    return 0;
}

#if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND)
/* Debug aid: print the shadow GS, flags and every saved MSR value of @m. */
static void dump_msr_state(struct vmx_msr_state *m)
{
    int idx;

    printk("**** msr state ****\n");
    printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);

    for ( idx = 0; idx < VMX_MSR_COUNT; idx++ )
        printk("0x%lx,", m->msrs[idx]);

    printk("\n");
}
#else
/* Compiled out: expands to nothing and never evaluates its argument. */
#define dump_msr_state(m) ((void)0)
#endif

/*
 * Fill the non-VMCS part of the save record @data: on x86_64 the guest's
 * syscall MSRs, CSTAR and shadow GS base, and on every build the guest
 * time.
 */
static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
{
#ifdef __x86_64__
    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;

    /* MSRs tracked in msr_state, together with their dirty flags. */
    data->msr_flags        = guest_state->flags;
    data->msr_lstar        = guest_state->msrs[VMX_INDEX_MSR_LSTAR];
    data->msr_star         = guest_state->msrs[VMX_INDEX_MSR_STAR];
    data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK];

    /* MSRs cached directly in the vcpu. */
    data->msr_cstar = v->arch.hvm_vmx.cstar;
    data->shadow_gs = v->arch.hvm_vmx.shadow_gs;
#endif

    data->tsc = hvm_get_guest_time(v);

    /* NB. Expands to nothing unless the suspend-debug build is enabled. */
    dump_msr_state(guest_state);
}

/*
 * Load the non-VMCS part of the save record @data into the vcpu: on x86_64
 * the guest's syscall MSRs, CSTAR and shadow GS base, and on every build
 * the vmxassist flag and the guest time.
 */
static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
{
#ifdef __x86_64__
    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;

    /* MSRs tracked in msr_state, together with their dirty flags. */
    guest_state->flags = data->msr_flags;
    guest_state->msrs[VMX_INDEX_MSR_LSTAR]        = data->msr_lstar;
    guest_state->msrs[VMX_INDEX_MSR_STAR]         = data->msr_star;
    guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask;

    /* MSRs cached directly in the vcpu. */
    v->arch.hvm_vmx.cstar     = data->msr_cstar;
    v->arch.hvm_vmx.shadow_gs = data->shadow_gs;
#endif

    /* vmxassist is considered enabled exactly when the saved CR0.PE is 0. */
    v->arch.hvm_vmx.vmxassist_enabled = !(data->cr0 & X86_CR0_PE);

    hvm_set_guest_time(v, data->tsc);

    /* NB. Expands to nothing unless the suspend-debug build is enabled. */
    dump_msr_state(guest_state);
}


/* Save a vcpu into @ctxt: first the MSR/time state, then the VMCS state. */
static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
{
    vmx_save_cpu_state(v, ctxt);

    vmx_vmcs_save(v, ctxt);
}

/*
 * Load a vcpu from @ctxt: first the MSR/time state, then the VMCS state.
 *
 * Returns 0 on success.  If the VMCS restore fails the domain is crashed
 * and -EINVAL is returned.
 */
static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
{
    vmx_load_cpu_state(v, ctxt);

    if ( vmx_vmcs_restore(v, ctxt) == 0 )
        return 0;

    gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n");
    domain_crash(v->domain);
    return -EINVAL;
}

/*
 * Context-switch-out callback: save the guest's MSR state, reinstate the
 * host MSRs, then save the guest's debug registers if they are dirty.
 */
static void vmx_ctxt_switch_from(struct vcpu *v)
{
    vmx_save_guest_msrs(v);
    vmx_restore_host_msrs();

    vmx_save_dr(v);
}

/*
 * Context-switch-in callback: load the guest's MSR state and, if it has a
 * breakpoint enabled, its debug registers.
 */
static void vmx_ctxt_switch_to(struct vcpu *v)
{
    vmx_restore_guest_msrs(v);

    vmx_restore_dr(v);
}

/*
 * Leave VMX operation on this CPU: execute VMXOFF and clear CR4.VMXE.
 * Does nothing if CR4.VMXE is already clear.
 */
static void stop_vmx(void)
{
    if ( read_cr4() & X86_CR4_VMXE )
    {
        __vmxoff();
        clear_in_cr4(X86_CR4_VMXE);
    }
}

/*
 * Copy guest register state out of the vcpu.
 *
 * @regs: if non-NULL, receives eflags, ss, cs, eip and esp from the VMCS.
 * @crs:  if non-NULL, entries 0, 2, 3 and 4 receive the shadow CR0, CR2,
 *        CR3 and shadow CR4 cached in the vcpu.
 */
static void vmx_store_cpu_guest_regs(
    struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs)
{
    vmx_vmcs_enter(v);

    if ( regs )
    {
        regs->eflags = __vmread(GUEST_RFLAGS);
        regs->ss     = __vmread(GUEST_SS_SELECTOR);
        regs->cs     = __vmread(GUEST_CS_SELECTOR);
        regs->eip    = __vmread(GUEST_RIP);
        regs->esp    = __vmread(GUEST_RSP);
    }

    if ( crs )
    {
        crs[0] = v->arch.hvm_vmx.cpu_shadow_cr0;
        crs[2] = v->arch.hvm_vmx.cpu_cr2;
        crs[3] = v->arch.hvm_vmx.cpu_cr3;
        crs[4] = v->arch.hvm_vmx.cpu_shadow_cr4;
    }

    vmx_vmcs_exit(v);
}

/*
 * Write ss, esp, eflags, cs and eip from @regs into the vcpu's VMCS.  When
 * @regs describes virtual-8086 mode the CS and SS base fields are also
 * brought in line with the new selectors.
 */
static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
{
    unsigned long vmcs_base;

    vmx_vmcs_enter(v);

    __vmwrite(GUEST_SS_SELECTOR, regs->ss);
    __vmwrite(GUEST_RSP, regs->esp);

    /* NB. Bit 1 of RFLAGS must be set for VMENTRY to succeed. */
    __vmwrite(GUEST_RFLAGS, regs->eflags | 2UL);

    if ( regs->eflags & EF_VM )
    {
        /*
         * For virtual-8086 mode guests the VMX spec (section 4.3.1.2,
         * Checks on Guest Segment Registers) requires each segment's
         * base-address field in the VMCS to equal its selector shifted
         * left by four bits (selector * 16) at vmentry.  Only rewrite a
         * base that does not already satisfy this.
         */
        vmcs_base = __vmread(GUEST_CS_BASE);
        if ( (regs->cs << 4) != vmcs_base )
            __vmwrite(GUEST_CS_BASE, regs->cs << 4);

        vmcs_base = __vmread(GUEST_SS_BASE);
        if ( (regs->ss << 4) != vmcs_base )
            __vmwrite(GUEST_SS_BASE, regs->ss << 4);
    }

    __vmwrite(GUEST_CS_SELECTOR, regs->cs);
    __vmwrite(GUEST_RIP, regs->eip);

    vmx_vmcs_exit(v);
}

/*
 * Return the vcpu's cached value for control register @num.
 *
 * CR0 comes from cpu_cr0, CR2 and CR3 from their cached copies, and CR4
 * from the shadow CR4.  Any other register number is a hypervisor bug.
 */
static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
{
    if ( num == 0 )
        return v->arch.hvm_vmx.cpu_cr0;

    if ( num == 2 )
        return v->arch.hvm_vmx.cpu_cr2;

    if ( num == 3 )
        return v->arch.hvm_vmx.cpu_cr3;

    if ( num == 4 )
        return v->arch.hvm_vmx.cpu_shadow_cr4;

    BUG();
    return 0;                   /* dummy */
}

/*
 * Return the base address of segment @seg for the current vcpu.
 *
 * When long mode is enabled and CS has the long-mode bit set, the CS, DS,
 * ES and SS bases are reported as 0; every other segment always reports the
 * base held in the VMCS.  An unknown @seg is a hypervisor bug.
 */
static unsigned long vmx_get_segment_base(struct vcpu *v, enum x86_segment seg)
{
    int long_mode;

    ASSERT(v == current);

    long_mode = vmx_long_mode_enabled(v) &&
                (__vmread(GUEST_CS_AR_BYTES) & X86_SEG_AR_CS_LM_ACTIVE);

    if ( long_mode )
    {
        /* These four are flat in 64-bit code; FS/GS and the rest are not. */
        switch ( seg )
        {
        case x86_seg_cs:
        case x86_seg_ds:
        case x86_seg_es:
        case x86_seg_ss:
            return 0;
        default:
            break;
        }
    }

    switch ( seg )
    {
    case x86_seg_cs:   return __vmread(GUEST_CS_BASE);
    case x86_seg_ds:   return __vmread(GUEST_DS_BASE);
    case x86_seg_es:   return __vmread(GUEST_ES_BASE);
    case x86_seg_fs:   return __vmread(GUEST_FS_BASE);
    case x86_seg_gs:   return __vmread(GUEST_GS_BASE);
    case x86_seg_ss:   return __vmread(GUEST_SS_BASE);
    case x86_seg_tr:   return __vmread(GUEST_TR_BASE);
    case x86_seg_gdtr: return __vmread(GUEST_GDTR_BASE);
    case x86_seg_idtr: return __vmread(GUEST_IDTR_BASE);
    case x86_seg_ldtr: return __vmread(GUEST_LDTR_BASE);
    default:
        BUG();
        break;
    }

    return 0;
}

/*
 * Local helper for vmx_get_segment_register(): read selector, limit, base
 * and access rights of segment NAME from the VMCS, in that order.  Uses the
 * enclosing function's 'reg' and 'attr'.
 */
#define VMX_READ_FULL_SEG(NAME)                                 \
    do {                                                        \
        reg->sel   = __vmread(GUEST_ ## NAME ## _SELECTOR);     \
        reg->limit = __vmread(GUEST_ ## NAME ## _LIMIT);        \
        reg->base  = __vmread(GUEST_ ## NAME ## _BASE);         \
        attr       = __vmread(GUEST_ ## NAME ## _AR_BYTES);     \
    } while ( 0 )

/*
 * Fetch segment @seg of the current vcpu from the VMCS into @reg.
 *
 * GDTR and IDTR only have a limit and a base, so reg->sel is left untouched
 * for them and the attributes are reported as 0.  The VMCS access-rights
 * value is repacked into reg->attr.bytes: bits 0-7 are kept in place and
 * bits 12-15 are moved down to bits 8-11.  An unknown @seg is a hypervisor
 * bug.
 */
static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                     struct segment_register *reg)
{
    u16 attr = 0;

    ASSERT(v == current);

    switch ( seg )
    {
    case x86_seg_cs:   VMX_READ_FULL_SEG(CS);   break;
    case x86_seg_ds:   VMX_READ_FULL_SEG(DS);   break;
    case x86_seg_es:   VMX_READ_FULL_SEG(ES);   break;
    case x86_seg_fs:   VMX_READ_FULL_SEG(FS);   break;
    case x86_seg_gs:   VMX_READ_FULL_SEG(GS);   break;
    case x86_seg_ss:   VMX_READ_FULL_SEG(SS);   break;
    case x86_seg_tr:   VMX_READ_FULL_SEG(TR);   break;
    case x86_seg_ldtr: VMX_READ_FULL_SEG(LDTR); break;

    case x86_seg_gdtr:
        reg->limit = __vmread(GUEST_GDTR_LIMIT);
        reg->base  = __vmread(GUEST_GDTR_BASE);
        break;

    case x86_seg_idtr:
        reg->limit = __vmread(GUEST_IDTR_LIMIT);
        reg->base  = __vmread(GUEST_IDTR_BASE);
        break;

    default:
        BUG();
    }

    reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00);
}

#undef VMX_READ_FULL_SEG

/* Arrange for Xen to intercept any FPU access made by the current vcpu. */
static void vmx_stts(struct vcpu *v)
{
    /* VMX depends on operating on the current vcpu */
    ASSERT(v == current);

    /*
     * A guest that already has CR0.TS set needs no help: it cannot touch
     * the FPU until it clears TS itself, and the FPU is initialised when
     * that happens.
     */
    if ( v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS )
        return;

    /*
     * Otherwise force TS in the real guest CR0 and intercept the resulting
     * device-not-available exception, so the first FPU use can be handled.
     */
    v->arch.hvm_vmx.cpu_cr0 |= X86_CR0_TS;
    __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
    __vm_set_bit(EXCEPTION_BITMAP, TRAP_no_device);
}

/*
 * Program the TSC offset of vcpu v's VMCS. On 32-bit builds the upper
 * half of the 64-bit offset is written through a separate field.
 */
static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
{
    vmx_vmcs_enter(v);

    __vmwrite(TSC_OFFSET, offset);
#ifdef __i386__
    __vmwrite(TSC_OFFSET_HIGH, offset >> 32);
#endif

    vmx_vmcs_exit(v);
}

/*
 * Build the initial register context for an application processor.
 * The context is zeroed; execution starts at VMXASSIST_BASE with the
 * vcpu id in EDX and the trampoline vector in EBX.
 */
static void vmx_init_ap_context(
    struct vcpu_guest_context *ctxt, int vcpuid, int trampoline_vector)
{
    memset(ctxt, 0, sizeof(*ctxt));

    ctxt->user_regs.ebx = trampoline_vector;
    ctxt->user_regs.edx = vcpuid;
    ctxt->user_regs.eip = VMXASSIST_BASE;
}

void do_nmi(struct cpu_user_regs *);

/*
 * Fill a hypercall page with one 32-byte stub per hypercall number:
 *
 *     mov  $<nr>, %eax
 *     vmcall
 *     ret
 *
 * The slot for __HYPERVISOR_iret is then overwritten with ud2, since that
 * hypercall is not supported here. The domain argument is not used.
 */
static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
{
    char *page = (char *)hypercall_page;
    char *stub;
    int nr;

    for ( nr = 0; nr < (PAGE_SIZE / 32); nr++ )
    {
        stub = page + (nr * 32);

        *(u8  *)&stub[0] = 0xb8; /* mov imm32, %eax */
        *(u32 *)&stub[1] = nr;
        *(u8  *)&stub[5] = 0x0f; /* vmcall */
        *(u8  *)&stub[6] = 0x01;
        *(u8  *)&stub[7] = 0xc1;
        *(u8  *)&stub[8] = 0xc3; /* ret */
    }

    /* Don't support HYPERVISOR_iret at the moment */
    *(u16 *)(page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
}

/*
 * Return non-zero if the guest RFLAGS read from the current VMCS does not
 * have interrupts masked (as judged by irq_masked()).
 */
static int vmx_interrupts_enabled(struct vcpu *v)
{
    unsigned long guest_rflags;

    guest_rflags = __vmread(GUEST_RFLAGS);

    return !irq_masked(guest_rflags);
}


/*
 * Write v->arch.cr3 into the HOST_CR3 field of v's VMCS.
 * v must be the current vcpu or must not be runnable.
 */
static void vmx_update_host_cr3(struct vcpu *v)
{
    ASSERT( (v == current) || !vcpu_runnable(v) );
    vmx_vmcs_enter(v);
    __vmwrite(HOST_CR3, v->arch.cr3);
    vmx_vmcs_exit(v);
}

/*
 * Write v->arch.hvm_vcpu.hw_cr3 into the GUEST_CR3 field of v's VMCS.
 * v must be the current vcpu or must not be runnable.
 */
static void vmx_update_guest_cr3(struct vcpu *v)
{
    ASSERT( (v == current) || !vcpu_runnable(v) );
    vmx_vmcs_enter(v);
    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
    vmx_vmcs_exit(v);
}

/* Guest TLB flush hook: intentionally a no-op on VMX (see below). */
static void vmx_flush_guest_tlbs(void)
{
    /* No tagged TLB support on VMX yet.  The fact that we're in Xen
     * at all means any guest will have a clean TLB when it's next run,
     * because VMRESUME will flush it for us. */
}

/*
 * Inject hardware exception trapnr (with error code errcode) into the
 * current vcpu. For a page fault, cr2 is additionally recorded as the
 * vcpu's cached CR2 value; for any other trap cr2 is ignored.
 */
static void vmx_inject_exception(
    unsigned int trapnr, int errcode, unsigned long cr2)
{
    struct vcpu *v = current;
    vmx_inject_hw_exception(v, trapnr, errcode);
    if ( trapnr == TRAP_page_fault )
        v->arch.hvm_vmx.cpu_cr2 = cr2;
}

/*
 * Report whether the IDT-vectoring information field of the current VMCS
 * is marked valid. Returns the masked valid bit (non-zero if set), not a
 * normalised 0/1. Must be called on the current vcpu.
 */
static int vmx_event_injection_faulted(struct vcpu *v)
{
    unsigned int idtv_info;

    ASSERT(v == current);

    idtv_info = __vmread(IDT_VECTORING_INFO_FIELD);

    return (idtv_info & INTR_INFO_VALID_MASK);
}

/*
 * Stop intercepting guest reads and writes of the given MSR by clearing
 * its bits in the global MSR bitmap page. MSRs outside the two
 * controllable ranges are silently left intercepted.
 */
static void disable_intercept_for_msr(u32 msr)
{
    char *read_map, *write_map;

    /*
     * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
     * have the write-low and read-high bitmap offsets the wrong way round.
     * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
     */
    if ( msr <= 0x1fff )
    {
        read_map  = vmx_msr_bitmap + 0x000; /* read-low */
        write_map = vmx_msr_bitmap + 0x800; /* write-low */
    }
    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
    {
        msr &= 0x1fff;
        read_map  = vmx_msr_bitmap + 0x400; /* read-high */
        write_map = vmx_msr_bitmap + 0xc00; /* write-high */
    }
    else
        return;

    __clear_bit(msr, read_map);
    __clear_bit(msr, write_map);
}

/*
 * VMX implementations of the HVM hooks. start_vmx() hands this table to
 * hvm_enable() on CPU 0.
 */
static struct hvm_function_table vmx_function_table = {
    .name                 = "VMX",
    .disable              = stop_vmx,
    .domain_initialise    = vmx_domain_initialise,
    .domain_destroy       = vmx_domain_destroy,
    .vcpu_initialise      = vmx_vcpu_initialise,
    .vcpu_destroy         = vmx_vcpu_destroy,
    .store_cpu_guest_regs = vmx_store_cpu_guest_regs,
    .load_cpu_guest_regs  = vmx_load_cpu_guest_regs,
    .save_cpu_ctxt        = vmx_save_vmcs_ctxt,
    .load_cpu_ctxt        = vmx_load_vmcs_ctxt,
    .paging_enabled       = vmx_paging_enabled,
    .long_mode_enabled    = vmx_long_mode_enabled,
    .pae_enabled          = vmx_pae_enabled,
    .nx_enabled           = vmx_nx_enabled,
    .interrupts_enabled   = vmx_interrupts_enabled,
    .guest_x86_mode       = vmx_guest_x86_mode,
    .get_guest_ctrl_reg   = vmx_get_ctrl_reg,
    .get_segment_base     = vmx_get_segment_base,
    .get_segment_register = vmx_get_segment_register,
    .update_host_cr3      = vmx_update_host_cr3,
    .update_guest_cr3     = vmx_update_guest_cr3,
    .flush_guest_tlbs     = vmx_flush_guest_tlbs,
    .stts                 = vmx_stts,
    .set_tsc_offset       = vmx_set_tsc_offset,
    .inject_exception     = vmx_inject_exception,
    .init_ap_context      = vmx_init_ap_context,
    .init_hypercall_page  = vmx_init_hypercall_page,
    .event_injection_faulted = vmx_event_injection_faulted
};

/*
 * Enable VMX operation on the calling CPU.
 *
 * Returns 1 on success. Returns 0 if the CPU does not advertise VMX, if
 * the feature-control MSR is locked with VMXON disabled, if the host VMCS
 * cannot be allocated, or if VMXON fails.
 *
 * On CPU 0 only, this also sets up the VMCS dump facility, registers the
 * VMX function table with the HVM layer and, when the CPU supports MSR
 * bitmaps, allocates the global bitmap page.
 */
int start_vmx(void)
{
    u32 eax, edx;
    struct vmcs_struct *vmcs;

    /*
     * Xen does not fill x86_capability words except 0.
     */
    boot_cpu_data.x86_capability[4] = cpuid_ecx(1);

    if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
        return 0;

    rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);

    if ( eax & IA32_FEATURE_CONTROL_MSR_LOCK )
    {
        /* MSR is locked: we can only honour whatever was configured. */
        if ( (eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0 )
        {
            printk("VMX disabled by Feature Control MSR.\n");
            return 0;
        }
    }
    else
    {
        /* MSR is not locked yet: enable VMXON and lock it ourselves. */
        wrmsr(IA32_FEATURE_CONTROL_MSR,
              IA32_FEATURE_CONTROL_MSR_LOCK |
              IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
    }

    set_in_cr4(X86_CR4_VMXE);

    vmx_init_vmcs_config();

    if ( smp_processor_id() == 0 )
        setup_vmcs_dump();

    /* Both failure paths below undo the CR4.VMXE change made above. */
    if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
    {
        clear_in_cr4(X86_CR4_VMXE);
        printk("Failed to allocate host VMCS\n");
        return 0;
    }

    if ( __vmxon(virt_to_maddr(vmcs)) )
    {
        clear_in_cr4(X86_CR4_VMXE);
        printk("VMXON failed\n");
        vmx_free_host_vmcs(vmcs);
        return 0;
    }

    vmx_save_host_msrs();

    /* Everything below is global, one-time setup done by CPU 0 only. */
    if ( smp_processor_id() != 0 )
        return 1;

    hvm_enable(&vmx_function_table);

    if ( cpu_has_vmx_msr_bitmap )
    {
        printk("VMX: MSR intercept bitmap enabled\n");
        vmx_msr_bitmap = alloc_xenheap_page();
        BUG_ON(vmx_msr_bitmap == NULL);
        /* Start with every bit set, then clear the bits for chosen MSRs. */
        memset(vmx_msr_bitmap, ~0, PAGE_SIZE);

        disable_intercept_for_msr(MSR_FS_BASE);
        disable_intercept_for_msr(MSR_GS_BASE);

        disable_intercept_for_msr(MSR_IA32_SYSENTER_CS);
        disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP);
        disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP);
    }

    return 1;
}

/*
 * Read the VM-exit instruction length from the current VMCS.
 *
 * The field is not valid for every kind of VM exit, so callers must know
 * that it is meaningful for the exit they are handling. A value outside
 * 1..15 is treated as a hypervisor bug.
 */
static int __get_instruction_length(void)
{
    int insn_len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */

    BUG_ON((insn_len > 15) || (insn_len < 1));

    return insn_len;
}

/*
 * Step the guest past an instruction of inst_len bytes: advance RIP,
 * clear RFLAGS.RF if it is set, and drop any STI / MOV-SS interrupt
 * shadow recorded in the interruptibility state.
 */
static void __update_guest_eip(unsigned long inst_len)
{
    unsigned long rip, rflags, intr_state;

    rip = __vmread(GUEST_RIP);
    __vmwrite(GUEST_RIP, rip + inst_len);

    rflags = __vmread(GUEST_RFLAGS);
    if ( rflags & X86_EFLAGS_RF )
        __vmwrite(GUEST_RFLAGS, rflags & ~X86_EFLAGS_RF);

    intr_state = __vmread(GUEST_INTERRUPTIBILITY_INFO);
    if ( !(intr_state & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS)) )
        return;

    intr_state &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_state);
}

/*
 * Handle a device-not-available (#NM) exit for the current vcpu: set up
 * the FPU, stop intercepting #NM and, unless the guest itself has CR0.TS
 * set in its shadow CR0, clear TS in the real guest CR0.
 */
static void vmx_do_no_device_fault(void)
{
    struct vcpu *v = current;

    setup_fpu(v);
    __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);

    /* Leave TS set if the guest wants the exception too. */
    if ( v->arch.hvm_vmx.cpu_shadow_cr0 & X86_CR0_TS )
        return;

    v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS;
    __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);
}

/* Bit mask of feature number idx within its 32-bit capability word. */
#define bitmaskof(idx)  (1U << ((idx) & 31))

/*
 * Emulate CPUID for the current guest.
 *
 * The leaf is taken from regs->eax and the sub-leaf from regs->ecx; the
 * results are written back to regs->eax/ebx/ecx/edx.
 *
 *  - Leaf 4 is read from the hardware and EAX is masked with
 *    NUM_CORES_RESET_MASK.
 *  - Leaf 0x40000003 is a private VMXASSIST interface: ECX:EDX carry an
 *    8-byte aligned guest pseudo-physical address and are replaced with
 *    the 64-bit value stored there. The domain is crashed (and the
 *    registers left untouched) if the address is misaligned or unmapped,
 *    or if vmxassist is not enabled for this vcpu.
 *  - All other leaves come from hvm_cpuid() and are then filtered.
 */
static void vmx_do_cpuid(struct cpu_user_regs *regs)
{
    unsigned int input = (unsigned int)regs->eax;
    unsigned int count = (unsigned int)regs->ecx;
    unsigned int eax, ebx, ecx, edx;

    if ( input == 0x00000004 )
    {
        cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
        eax &= NUM_CORES_RESET_MASK;
    }
    else if ( input == 0x40000003 )
    {
        /*
         * NB. Unsupported interface for private use of VMXASSIST only.
         * Note that this leaf lives at <max-hypervisor-leaf> + 1.
         */
        u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
        unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
        struct vcpu *v = current;
        char *p;

        gdprintk(XENLOG_INFO, "Input address is 0x%"PRIx64".\n", value);

        /* 8-byte aligned valid pseudophys address from vmxassist, please. */
        if ( (value & 7) || (mfn == INVALID_MFN) ||
             !v->arch.hvm_vmx.vmxassist_enabled )
        {
            domain_crash(v->domain);
            return;
        }

        p = map_domain_page(mfn);
        value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
        unmap_domain_page(p);

        gdprintk(XENLOG_INFO, "Output value is 0x%"PRIx64".\n", value);

        /*
         * Only ECX:EDX carry a result for this leaf. EAX and EBX must
         * still be given defined values, because all four locals are
         * copied into the guest registers below; leaving them unset would
         * expose uninitialised hypervisor stack contents to the guest.
         * Hand the guest back its own (32-bit) EAX/EBX.
         */
        eax = (unsigned int)regs->eax;
        ebx = (unsigned int)regs->ebx;
        ecx = (u32)value;
        edx = (u32)(value >> 32);
    }
    else
    {
        hvm_cpuid(input, &eax, &ebx, &ecx, &edx);

        if ( input == 0x00000001 )
        {
            /* Mask off reserved bits. */
            ecx &= ~VMX_VCPU_CPUID_L1_ECX_RESERVED;

            ebx &= NUM_THREADS_RESET_MASK;

            /* Unsupportable for virtualised CPUs. */
            ecx &= ~(bitmaskof(X86_FEATURE_VMXE) |
                     bitmaskof(X86_FEATURE_EST)  |
                     bitmaskof(X86_FEATURE_TM2)  |
                     bitmaskof(X86_FEATURE_CID));

            edx &= ~(bitmaskof(X86_FEATURE_HT)   |
                     bitmaskof(X86_FEATURE_ACPI) |
                     bitmaskof(X86_FEATURE_ACC));
        }

        if ( input == 0x80000001 )
        {
            /* Only a few features are advertised in Intel's 0x80000001. */
            ecx &= (bitmaskof(X86_FEATURE_LAHF_LM));
            edx &= (bitmaskof(X86_FEATURE_NX) |
                    bitmaskof(X86_FEATURE_LM) |
                    bitmaskof(X86_FEATURE_SYSCALL));
        }

        /* Leaves 6, 9 and 0xA are reported to the guest as all zeroes. */
        if ( input == 0x00000006 || input == 0x00000009 || input == 0x0000000A )
            eax = ebx = ecx = edx = 0x0;
    }

    regs->eax = (unsigned long)eax;
    regs->ebx = (unsigned long)ebx;
    regs->ecx = (unsigned long)ecx;
    regs->edx = (unsigned long)edx;

    HVMTRACE_3D(CPUID, current, input,
                ((uint64_t)eax << 32) | ebx, ((uint64_t)ecx << 32) | edx);
}

/*
 * Helper for switch statements that map a REG_<name> selector onto the
 * matching member of a register frame. The expansion site must provide a
 * `regs` pointer and an `unsigned long *reg_p` variable to receive the
 * address.
 */
#define CASE_GET_REG_P(REG, reg)    \
    case REG_ ## REG: reg_p = (unsigned long *)&(regs->reg); break

/* Extra cases for r8-r15; expands to nothing on 32-bit (__i386__) builds. */
#ifdef __i386__
#define CASE_EXTEND_GET_REG_P
#else
#define CASE_EXTEND_GET_REG_P       \
    CASE_GET_REG_P(R8, r8);         \
    CASE_GET_REG_P(R9, r9);         \
    CASE_GET_REG_P(R10, r10);       \
    CASE_GET_REG_P(R11, r11);       \
    CASE_GET_REG_P(R12, r12);       \
    CASE_GET_REG_P(R13, r13);       \
    CASE_GET_REG_P(R14, r14);       \
    CASE_GET_REG_P(R15, r15)
#endif

/*
 * Handle a debug-register access exit for the current vcpu.
 *
 * The access is not emulated here: the guest's debug registers are
 * restored if they are not already marked dirty, and MOV-DR exiting is
 * then switched off in the CPU-based execution controls. Neither
 * exit_qualification nor regs is used, and the trace record is always of
 * type DR_WRITE.
 */
static void vmx_dr_access(unsigned long exit_qualification,
                          struct cpu_user_regs *regs)
{
    struct vcpu *v = current;

    HVMTRACE_0D(DR_WRITE, v);

    if ( !v->arch.hvm_vcpu.flag_dr_dirty )
        __restore_debug_registers(v);

    /* Allow guest direct access to DR registers */
    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING;
    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
}

/*
 * Handle a guest INVLPG of virtual address va on the current vcpu by
 * passing it to the paging layer (paging_invlpg()), after tracing and
 * logging it. The guest RIP is read only for the debug log.
 */
static void vmx_do_invlpg(unsigned long va)
{
    unsigned long eip;
    struct vcpu *v = current;

    HVMTRACE_2D(INVLPG, v, /*invlpga=*/ 0, va);

    eip = __vmread(GUEST_RIP);

    HVM_DBG_LOG(DBG_LEVEL_VMMU, "eip=%lx, va=%lx",
                eip, va);

    /*
     * We do the safest things first, then try to update the shadow
     * copying from guest
     */
    paging_invlpg(v, va);
}

/*
 * Determine the segment used by a string PIO instruction by scanning the
 * guest instruction bytes for segment-override prefixes.
 *
 * inst_len bytes are fetched from eip (plus the CS base when not in long
 * mode). Every byte that is a segment-override prefix updates *seg, so
 * the last override wins; all other bytes (REP/REPNZ, LOCK, operand- and
 * address-size prefixes, REX, the opcode itself) leave *seg untouched.
 * If no override is present *seg keeps the value supplied by the caller.
 *
 * If the instruction cannot be fetched the domain is crashed and *seg is
 * left unchanged.
 */
static void vmx_str_pio_get_segment(int long_mode, unsigned long eip,
                                   int inst_len, enum x86_segment *seg)
{
    extern int inst_copy_from_guest(unsigned char *, unsigned long, int);
    unsigned char insn[MAX_INST_LEN];
    int pos;

    if ( !long_mode )
        eip += __vmread(GUEST_CS_BASE);

    memset(insn, 0, MAX_INST_LEN);
    if ( inst_copy_from_guest(insn, eip, inst_len) != inst_len )
    {
        gdprintk(XENLOG_ERR, "Get guest instruction failed\n");
        domain_crash(current->domain);
        return;
    }

    for ( pos = 0; pos < inst_len; pos++ )
    {
        switch ( insn[pos] )
        {
        case 0x26: /* ES */
            *seg = x86_seg_es;
            break;
        case 0x2e: /* CS */
            *seg = x86_seg_cs;
            break;
        case 0x36: /* SS */
            *seg = x86_seg_ss;
            break;
        case 0x3e: /* DS */
            *seg = x86_seg_ds;
            break;
        case 0x64: /* FS */
            *seg = x86_seg_fs;
            break;
        case 0x65: /* GS */
            *seg = x86_seg_gs;
            break;
        default:
            /* Any other prefix or opcode byte: nothing to record. */
            break;
        }
    }
}

/*
 * Look up the descriptor state of the segment used by a string PIO
 * instruction.
 *
 * seg is the default segment. Unless it is ES, the instruction bytes are
 * scanned for a segment override which may replace it. *ar_bytes always
 * receives the segment's access-rights field. *base and *limit are read
 * from the VMCS outside long mode, or for FS/GS in long mode; otherwise
 * they are left at zero.
 *
 * Returns non-zero if bit 16 (0x10000) of the access rights is clear,
 * zero if it is set.
 */
static int vmx_str_pio_check_descriptor(int long_mode, unsigned long eip,
                                        int inst_len, enum x86_segment seg,
                                        unsigned long *base, u32 *limit,
                                        u32 *ar_bytes)
{
    enum vmcs_field ar_fld, base_fld, limit_fld;

/* Select the three VMCS fields of segment SEG and leave the switch. */
#define PICK_SEG_FIELDS(SEG)                        \
        ar_fld    = GUEST_ ## SEG ## _AR_BYTES;     \
        base_fld  = GUEST_ ## SEG ## _BASE;         \
        limit_fld = GUEST_ ## SEG ## _LIMIT;        \
        break

    *base = 0;
    *limit = 0;
    if ( seg != x86_seg_es )
        vmx_str_pio_get_segment(long_mode, eip, inst_len, &seg);

    switch ( seg )
    {
    case x86_seg_cs:
        PICK_SEG_FIELDS(CS);
    case x86_seg_ds:
        PICK_SEG_FIELDS(DS);
    case x86_seg_es:
        PICK_SEG_FIELDS(ES);
    case x86_seg_fs:
        PICK_SEG_FIELDS(FS);
    case x86_seg_gs:
        PICK_SEG_FIELDS(GS);
    case x86_seg_ss:
        PICK_SEG_FIELDS(SS);
    default:
        BUG();
        return 0;
    }

#undef PICK_SEG_FIELDS

    if ( !long_mode || seg == x86_seg_fs || seg == x86_seg_gs )
    {
        *base = __vmread(base_fld);
        *limit = __vmread(limit_fld);
    }
    *ar_bytes = __vmread(ar_fld);

    return !(*ar_bytes & 0x10000);
}


/*
 * Check a string PIO access made outside long mode against the segment
 * limit.
 *
 * limit/ar_bytes/base describe the segment, addr is the linear address of
 * the first element, size the element size in bytes, df the direction
 * flag. *count is the repeat count on entry; it may be reduced so that no
 * later iteration violates the limit.
 *
 * Returns 1 if the first access is within the limit. Returns 0 after
 * injecting #GP into the current vcpu; the caller must then abandon the
 * instruction.
 */
static int vmx_str_pio_check_limit(u32 limit, unsigned int size,
                                   u32 ar_bytes, unsigned long addr,
                                   unsigned long base, int df,
                                   unsigned long *count)
{
    unsigned long ea = addr - base;

    /*
     * Offset must be within limits. The access faults if it wraps the
     * 32-bit offset space, or (expand-up segment) ends beyond the limit,
     * or (expand-down segment) starts at or below the limit. The
     * conditional expression needs its own parentheses: "||" binds more
     * tightly than "?:", so without them a wrapping access would merely
     * select the expand-up comparison, which on a 32-bit build is itself
     * evaluated on the wrapped value.
     */
    ASSERT(ea == (u32)ea);
    if ( (u32)(ea + size - 1) < (u32)ea ||
         ((ar_bytes & 0xc) != 0x4 ? ea + size - 1 > limit
                                  : ea <= limit) )
    {
        vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
        return 0;
    }

    /* Check the limit for repeated instructions, as above we checked
       only the first instance. Truncate the count if a limit violation
       would occur. Note that the checking is not necessary for page
       granular segments as transfers crossing page boundaries will be
       broken up anyway. */
    if ( !(ar_bytes & X86_SEG_AR_GRANULARITY) && *count > 1 )
    {
        if ( (ar_bytes & 0xc) != 0x4 )
        {
            /* expand-up */
            if ( !df )
            {
                if ( ea + *count * size - 1 < ea ||
                     ea + *count * size - 1 > limit )
                    *count = (limit + 1UL - ea) / size;
            }
            else
            {
                if ( *count - 1 > ea / size )
                    *count = ea / size + 1;
            }
        }
        else
        {
            /* expand-down */
            if ( !df )
            {
                if ( *count - 1 > -(s32)ea / size )
                    *count = -(s32)ea / size + 1UL;
            }
            else
            {
                if ( ea < (*count - 1) * size ||
                     ea - (*count - 1) * size <= limit )
                    *count = (ea - limit - 1) / size + 1;
            }
        }
        ASSERT(*count);
    }

    return 1;
}

#ifdef __x86_64__
/*
 * Long-mode counterpart of vmx_str_pio_check_limit(): the first element
 * must lie entirely at canonical addresses, and *count is clamped.
 *
 * Returns 1 if the first access is acceptable. Returns 0 after injecting
 * #GP into the current vcpu; the caller must then abandon the
 * instruction.
 *
 * NOTE(review): the DF branch tests addr + (*count - 1) * size, although
 * with DF set the accesses move downwards (vmx_send_str_pio() computes
 * addr - (count - 1) * size for that case). Kept as is - confirm intent.
 */
static int vmx_str_pio_lm_check_limit(struct cpu_user_regs *regs,
                                      unsigned int size,
                                      unsigned long addr,
                                      unsigned long *count)
{
    if ( !is_canonical_address(addr) ||
         !is_canonical_address(addr + size - 1) )
    {
        vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
        return 0;
    }
    if ( *count > (1UL << 48) / size )
        *count = (1UL << 48) / size;
    if ( !(regs->eflags & EF_DF) )
    {
        if ( addr + *count * size - 1 < addr ||
             !is_canonical_address(addr + *count * size - 1) )
            *count = (addr & ~((1UL << 48) - 1)) / size;
    }
    else
    {
        if ( (*count - 1) * size > addr ||
             !is_canonical_address(addr + (*count - 1) * size) )
            *count = (addr & ~((1UL << 48) - 1)) / size + 1;
    }
    ASSERT(*count);

    return 1;
}
#endif

/*
 * Issue the port I/O request for a string PIO instruction whose operand
 * has already been validated and translated.
 *
 * addr is the guest linear address of the first element and paddr its
 * guest physical address; sign is +1/-1 and df is 0/1 according to the
 * direction flag.
 *
 *  - If a single element straddles a page boundary, exactly one element
 *    is transferred by value (flagged OVERLAP). For OUTS the value is
 *    first copied from guest memory, and a page fault is injected instead
 *    if that copy fails on a virtual address. For INS the destination
 *    virtual address is remembered in pio_opp->addr.
 *  - Otherwise the request is sent by guest physical address, with count
 *    cut down so the transfer stays within the page containing addr.
 *
 * regs->eip is advanced past the instruction only when the request being
 * sent covers all remaining iterations.
 */
static void vmx_send_str_pio(struct cpu_user_regs *regs,
                             struct hvm_io_op *pio_opp,
                             unsigned long inst_len, unsigned int port,
                             int sign, unsigned int size, int dir,
                             int df, unsigned long addr,
                             paddr_t paddr, unsigned long count)
{
    /*
     * Handle string pio instructions that cross pages or that
     * are unaligned. See the comments in hvm_domain.c/handle_mmio()
     */
    if ( (addr & PAGE_MASK) != ((addr + size - 1) & PAGE_MASK) ) {
        unsigned long value = 0;

        pio_opp->flags |= OVERLAP;

        if ( dir == IOREQ_WRITE )   /* OUTS */
        {
            if ( hvm_paging_enabled(current) )
            {
                int rv, todo;
                rv = hvm_copy_from_guest_virt(&value, addr, size, &todo);
                if ( rv == HVMCOPY_bad_gva_to_gfn )
                {
                    /* Failed on the page-spanning copy.  Inject PF into
                     * the guest for the address where we failed. */
                    addr += size - todo;
                    gdprintk(XENLOG_DEBUG, "Pagefault reading non-io side "
                             "of a page-spanning PIO: va=%#lx\n", addr);
                    vmx_inject_exception(TRAP_page_fault, 0, addr);
                    return;
                }
            }
            else
                (void) hvm_copy_from_guest_phys(&value, addr, size);
        } else /* dir != IOREQ_WRITE */
            /* Remember where to write the result, as a *VA*.
             * Must be a VA so we can handle the page overlap
             * correctly in hvm_pio_assist() */
            pio_opp->addr = addr;

        if ( count == 1 )
            regs->eip += inst_len;

        send_pio_req(port, 1, size, value, dir, df, 0);
    } else {
        unsigned long last_addr = sign > 0 ? addr + count * size - 1
                                           : addr - (count - 1) * size;

        if ( (addr & PAGE_MASK) != (last_addr & PAGE_MASK) )
        {
            /* Clip the transfer to the page containing addr. */
            if ( sign > 0 )
                count = (PAGE_SIZE - (addr & ~PAGE_MASK)) / size;
            else
                count = (addr & ~PAGE_MASK) / size + 1;
        } else
            regs->eip += inst_len;

        send_pio_req(port, count, size, paddr, dir, df, 1);
    }
}

/*
 * Handle a VM exit caused by a string I/O instruction (INS/OUTS).
 *
 * The exit qualification is decoded as follows by this code: bits 0-2
 * hold the access size minus one, bit 3 the direction, bit 5 the presence
 * of a REP prefix, and bit 6 selects whether the port comes from bits
 * 16-31 of the qualification or from DX.
 *
 * The memory operand is validated (segment usability, access type and
 * limit outside long mode; canonical addresses in long mode), translated
 * to a guest frame and passed to vmx_send_str_pio(). Whenever a check
 * injects an exception into the guest, the function returns without
 * issuing any I/O request.
 */
static void vmx_do_str_pio(unsigned long exit_qualification,
                           unsigned long inst_len,
                           struct cpu_user_regs *regs,
                           struct hvm_io_op *pio_opp)
{
    unsigned int port, size;
    int dir, df, vm86;
    unsigned long addr, count = 1, base;
    paddr_t paddr;
    unsigned long gfn;
    u32 ar_bytes, limit;
    int sign;
    int long_mode = 0;

    vm86 = regs->eflags & X86_EFLAGS_VM ? 1 : 0;
    df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;

    if ( test_bit(6, &exit_qualification) )
        port = (exit_qualification >> 16) & 0xFFFF;
    else
        port = regs->edx & 0xffff;

    size = (exit_qualification & 7) + 1;
    dir = test_bit(3, &exit_qualification); /* direction */

    if ( dir == IOREQ_READ )
        HVMTRACE_2D(IO_READ,  current, port, size);
    else
        HVMTRACE_2D(IO_WRITE, current, port, size);

    sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1;
    ar_bytes = __vmread(GUEST_CS_AR_BYTES);
    if ( vmx_long_mode_enabled(current) &&
         (ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) )
        long_mode = 1;
    addr = __vmread(GUEST_LINEAR_ADDRESS);

    if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */
        pio_opp->flags |= REPZ;
        count = regs->ecx;
        /* Only the low 16 bits of the count are used in 16-bit code. */
        if ( !long_mode &&
            (vm86 || !(ar_bytes & X86_SEG_AR_DEF_OP_SIZE)) )
            count &= 0xFFFF;
    }

    /*
     * In protected mode, guest linear address is invalid if the
     * selector is null.
     */
    if ( !vmx_str_pio_check_descriptor(long_mode, regs->eip, inst_len,
                                       dir==IOREQ_WRITE ? x86_seg_ds :
                                       x86_seg_es, &base, &limit,
                                       &ar_bytes) ) {
        if ( !long_mode ) {
            vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
            return;
        }
        /* Long mode: rebuild the linear address from the index register. */
        addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi;
    }

    if ( !long_mode )
    {
        /* Segment must be readable for outs and writeable for ins. */
        if ( dir == IOREQ_WRITE ? (ar_bytes & 0xa) == 0x8
                                : (ar_bytes & 0xa) != 0x2 ) {
            vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
            return;
        }

        /* A #GP has been injected on failure: do not perform the I/O. */
        if ( !vmx_str_pio_check_limit(limit, size, ar_bytes,
                                      addr, base, df, &count) )
            return;
    }
#ifdef __x86_64__
    else
    {
        /* Likewise, stop here if the long-mode check raised #GP. */
        if ( !vmx_str_pio_lm_check_limit(regs, size, addr, &count) )
            return;
    }
#endif

    /* Translate the address to a physical address */
    gfn = paging_gva_to_gfn(current, addr);
    if ( gfn == INVALID_GFN )
    {
        /* The guest does not have the RAM address mapped.
         * Need to send in a page fault */
        int errcode = 0;
        /* IO read --> memory write */
        if ( dir == IOREQ_READ ) errcode |= PFEC_write_access;
        vmx_inject_exception(TRAP_page_fault, errcode, addr);
        return;
    }
    paddr = (paddr_t)gfn << PAGE_SHIFT | (addr & ~PAGE_MASK);

    vmx_send_str_pio(regs, pio_opp, inst_len, port, sign,
                     size, dir, df, addr, paddr, count);
}

/*
 * Handle a VM exit caused by an I/O instruction on the current vcpu.
 *
 * The guest register state is snapshotted into the vcpu's io_op context.
 * String instructions (bit 4 of the exit qualification) are handed to
 * vmx_do_str_pio(). For plain IN/OUT the port, size and direction are
 * decoded from the exit qualification, the saved EIP is advanced past the
 * instruction and a single PIO request carrying EAX is sent. One-byte
 * writes to port 0xe9 are additionally passed to hvm_print_line().
 */
static void vmx_io_instruction(unsigned long exit_qualification,
                               unsigned long inst_len)
{
    struct hvm_io_op *pio_opp = &current->arch.hvm_vcpu.io_op;
    struct cpu_user_regs *regs = &pio_opp->io_context;
    unsigned int port, size;
    int dir, df;

    pio_opp->instr = INSTR_PIO;
    pio_opp->flags = 0;

    /* Copy current guest state into io instruction state structure. */
    memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
    hvm_store_cpu_guest_regs(current, regs, NULL);

    HVM_DBG_LOG(DBG_LEVEL_IO, "vm86 %d, eip=%x:%lx, "
                "exit_qualification = %lx",
                regs->eflags & X86_EFLAGS_VM ? 1 : 0,
                regs->cs, (unsigned long)regs->eip, exit_qualification);

    /* String instructions have their own, more involved, handler. */
    if ( test_bit(4, &exit_qualification) )
    {
        vmx_do_str_pio(exit_qualification, inst_len, regs, pio_opp);
        return;
    }

    df = regs->eflags & X86_EFLAGS_DF ? 1 : 0;

    /* Bit 6 set: port is in the exit qualification; clear: port is in DX. */
    if ( test_bit(6, &exit_qualification) )
        port = (exit_qualification >> 16) & 0xFFFF;
    else
        port = regs->edx & 0xffff;

    size = (exit_qualification & 7) + 1;
    dir = test_bit(3, &exit_qualification); /* direction */

    if ( dir == IOREQ_READ )
        HVMTRACE_2D(IO_READ,  current, port, size);
    else
        HVMTRACE_2D(IO_WRITE, current, port, size);

    if ( port == 0xe9 && dir == IOREQ_WRITE && size == 1 )
        hvm_print_line(current, regs->eax); /* guest debug output */

    regs->eip += inst_len;
    send_pio_req(port, 1, size, regs->eax, dir, df, 0);
}

/*
 * Capture the guest's register state into a vmxassist context.
 *
 * The saved EIP points past the instruction that caused the current exit,
 * and RF is cleared in the saved EFLAGS. CR0, CR3 and CR4 are taken from
 * the values cached in the vcpu; everything else is read from the current
 * VMCS.
 */
static void vmx_world_save(struct vcpu *v, struct vmx_assist_context *c)
{
/* Save selector, limit, base and access rights of one segment register. */
#define SAVE_SEG(seg, SEG)                                                \
    do {                                                                  \
        c->seg ## _sel           = __vmread(GUEST_ ## SEG ## _SELECTOR);  \
        c->seg ## _limit         = __vmread(GUEST_ ## SEG ## _LIMIT);     \
        c->seg ## _base          = __vmread(GUEST_ ## SEG ## _BASE);      \
        c->seg ## _arbytes.bytes = __vmread(GUEST_ ## SEG ## _AR_BYTES);  \
    } while ( 0 )

    /* NB. Skip transition instruction. */
    c->eip  = __vmread(GUEST_RIP);
    c->eip += __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */

    c->esp    = __vmread(GUEST_RSP);
    c->eflags = __vmread(GUEST_RFLAGS) & ~X86_EFLAGS_RF;

    c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
    c->cr3 = v->arch.hvm_vmx.cpu_cr3;
    c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;

    /* Descriptor tables. */
    c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
    c->idtr_base  = __vmread(GUEST_IDTR_BASE);

    c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
    c->gdtr_base  = __vmread(GUEST_GDTR_BASE);

    /* Segment registers, then TR and LDTR. */
    SAVE_SEG(cs, CS);
    SAVE_SEG(ds, DS);
    SAVE_SEG(es, ES);
    SAVE_SEG(ss, SS);
    SAVE_SEG(fs, FS);
    SAVE_SEG(gs, GS);
    SAVE_SEG(tr, TR);
    SAVE_SEG(ldtr, LDTR);

#undef SAVE_SEG
}

static int vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
{
    unsigned long mfn, old_base_mfn;

    __vmwrite(GUEST_RIP, c->eip);
    __vmwrite(GUEST_RSP, c->esp);
    __vmwrite(GUEST_RFLAGS, c->eflags);

    v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
    __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);

    if ( !vmx_paging_enabled(v) )
        goto skip_cr3;

    if ( c->cr3 == v->arch.hvm_vmx.cpu_cr3 )
    {
        /*
         * This is simple TLB flush, implying the guest has
         * removed some translation or changed page attributes.
         * We simply invalidate the shadow.
         */
        mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
        if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
            goto bad_cr3;
    }
    else
    {
        /*
         * If different, make a shadow. Check if the PDBR is valid
         * first.
         */
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
        mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
            goto bad_cr3;
        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
        v->arch.guest_table = pagetable_from_pfn(mfn);
        if ( old_base_mfn )
             put_page(mfn_to_page(old_base_mfn));
        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
    }

 skip_cr3:
    if ( !vmx_paging_enabled(v) )
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
    else
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);

    __vmwrite(GUEST_CR4, (c->cr4 | HVM_CR4_HOST_MASK));
    v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
    __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);

    __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
    __vmwrite(GUEST_IDTR_BASE, c->idtr_base);

    __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
    __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);

    __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
    __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
    __vmwrite(GUEST_CS_BASE, c->cs_base);
    __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);

    __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
    __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
    __vmwrite(GUEST_DS_BASE, c->ds_base);
    __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);

    __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
    __vmwrite(GUEST_ES_LIMIT, c->es_limit);
    __vmwrite(GUEST_ES_BASE, c->es_base);
    __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);

    __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
    __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
    __vmwrite(GUEST_SS_BASE, c->ss_base);
    __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);

    __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
    __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
    __vmwrite(GUEST_FS_BASE, c->fs_base);
    __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);

    __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
    __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
    __vmwrite(GUEST_GS_BASE, c->gs_base);
    __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);

    __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
    __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
    __vmwrite(GUEST_TR_BASE, c->tr_base);
    __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);

    __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
    __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
    __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
    __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);

    paging_update_paging_modes(v);
    return 0;

 bad_cr3:
    gdprintk(XENLOG_ERR, "Invalid CR3 value=%x", c->cr3);
    return -EINVAL;
}

/*
 * Modes accepted by vmx_assist(): INVOKE transfers control to vmxassist,
 * RESTORE restores the context that INVOKE saved.
 */
enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };

static int vmx_assist(struct vcpu *v, int mode)
{
    struct vmx_assist_context c;
    struct hvm_hw_vpic *vpic = v->domain->arch.hvm_domain.vpic;
    u32 magic, cp;

    /* make sure vmxassist exists (this is not an error) */
    if ( hvm_copy_from_guest_phys(&magic, VMXASSIST_MAGIC_OFFSET,
                                  sizeof(magic)) )
        return 0;
    if ( magic != VMXASSIST_MAGIC )
        return 0;

    switch ( mode ) {
        /*
         * Transfer control to vmxassist.
         * Store the current context in VMXASSIST_OLD_CONTEXT and load
         * the new VMXASSIST_NEW_CONTEXT context. This context was created
         * by vmxassist and will transfer control to it.
         */
    case VMX_ASSIST_INVOKE:
        /* save the old context */
        if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
            goto error;
        if ( cp != 0 ) {
            vmx_world_save(v, &c);
            if ( hvm_copy_to_guest_phys(cp, &c, sizeof(c)) )
                goto error;
        }

        /* restore the new context, this should activate vmxassist */
        if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp)) )
            goto error;
        if ( cp != 0 ) {
            if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
                goto error;
            if ( vmx_world_restore(v, &c) != 0 )
                goto error;
            v->arch.hvm_vmx.pm_irqbase[0] = vpic[0].irq_base;
            v->arch.hvm_vmx.pm_irqbase[1] = vpic[1].irq_base;
            vpic[0].irq_base = NR_EXCEPTION_HANDLER;
            vpic[1].irq_base = NR_EXCEPTION_HANDLER + 8;
            v->arch.hvm_vmx.vmxassist_enabled = 1;
            return 1;
        }
        break;

        /*
         * Restore the VMXASSIST_OLD_CONTEXT that was saved by
         * VMX_ASSIST_INVOKE above.
         */
    case VMX_ASSIST_RESTORE:
        /* save the old context */
        if ( hvm_copy_from_guest_phys(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp)) )
            goto error;
        if ( cp != 0 ) {
            if ( hvm_copy_from_guest_phys(&c, cp, sizeof(c)) )
                goto error;
            if ( vmx_world_restore(v, &c) != 0 )
                goto error;
            if ( v->arch.hvm_vmx.irqbase_mode ) {
                vpic[0].irq_base = c.rm_irqbase[0] & 0xf8;
                vpic[1].irq_base = c.rm_irqbase[1] & 0xf8;
            } else {
                vpic[0].irq_base = v->arch.hvm_vmx.pm_irqbase[0];
                vpic[1].irq_base = v->arch.hvm_vmx.pm_irqbase[1];
            }
            v->arch.hvm_vmx.vmxassist_enabled = 0;
            return 1;
        }
        break;
    }

 error:
    gdprintk(XENLOG_ERR, "Failed to transfer to vmxassist\n");
    domain_crash(v->domain);
    return 0;
}

/*
 * Emulate a write of 'value' to CR0 by the current vcpu.
 *
 * The real guest CR0 is written with PE, PG, NE and WP forced on, while the
 * CR0 read shadow records what the guest wrote (with ET forced on). The
 * function then handles the paging-enable, paging-disable and
 * protected-mode/real-mode transitions implied by the new value.
 *
 * Returns 1 if the caller should advance the guest instruction pointer, or 0
 * if it must not: a #GP was injected, the domain was crashed, or control was
 * transferred to or from vmxassist.
 */
static int vmx_set_cr0(unsigned long value)
{
    struct vcpu *v = current;
    unsigned long mfn;
    unsigned long eip;
    int paging_enabled;
    unsigned long old_cr0;
    unsigned long old_base_mfn;

    HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);

    /* ET is reserved and should always be 1. */
    value |= X86_CR0_ET;

    /* PG set without PE is an invalid combination: reflect #GP to the guest. */
    if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG )
    {
        vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
        return 0;
    }

    /* TS cleared? Then initialise FPU now. */
    if ( !(value & X86_CR0_TS) )
    {
        setup_fpu(v);
        __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);
    }

    /* Remember whether the guest had paging enabled before this write. */
    old_cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
    paging_enabled = old_cr0 & X86_CR0_PG;

    /* The real CR0 always carries PE, PG, NE and WP. */
    v->arch.hvm_vmx.cpu_cr0 = (value | X86_CR0_PE | X86_CR0_PG
                               | X86_CR0_NE | X86_CR0_WP);
    __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);

    /* The read shadow keeps the value as written by the guest. */
    v->arch.hvm_vmx.cpu_shadow_cr0 = value;
    __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);

    /* Trying to enable paging. */
    if ( (value & X86_CR0_PE) && (value & X86_CR0_PG) && !paging_enabled )
    {
        if ( vmx_lme_is_set(v) && !vmx_long_mode_enabled(v) )
        {
            /* Entering long mode requires CR4.PAE; otherwise inject #GP. */
            if ( !(v->arch.hvm_vmx.cpu_shadow_cr4 & X86_CR4_PAE) )
            {
                HVM_DBG_LOG(DBG_LEVEL_1, "Guest enabled paging "
                            "with EFER.LME set but not CR4.PAE");
                vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
                return 0;
            }

            HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode");
            vmx_enable_long_mode(v);
        }

        /*
         * The guest CR3 must be pointing to the guest physical.
         */
        mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
        {
            gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx (mfn=%lx)\n",
                     v->arch.hvm_vmx.cpu_cr3, mfn);
            domain_crash(v->domain);
            return 0;
        }

        /*
         * Now arch.guest_table points to machine physical.
         */
        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
        v->arch.guest_table = pagetable_from_pfn(mfn);
        /* Drop the reference held on the previous top-level table, if any. */
        if ( old_base_mfn )
            put_page(mfn_to_page(old_base_mfn));

        paging_update_paging_modes(v);

        HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                    (unsigned long) (mfn << PAGE_SHIFT));

        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
                    v->arch.hvm_vmx.cpu_cr3, mfn);
    }

    /* Trying to disable paging. */
    if ( ((value & (X86_CR0_PE | X86_CR0_PG)) != (X86_CR0_PE | X86_CR0_PG)) &&
         paging_enabled )
    {
        /* Release the page reference taken for the guest's CR3. */
        if ( v->arch.hvm_vmx.cpu_cr3 )
        {
            put_page(mfn_to_page(get_mfn_from_gpfn(
                      v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)));
            v->arch.guest_table = pagetable_null();
        }

        if ( vmx_long_mode_enabled(v) )
            vmx_disable_long_mode(v);
    }

    /*
     * VMX does not implement real-mode virtualization. We emulate
     * real-mode by performing a world switch to VMXAssist whenever
     * a partition disables the CR0.PE bit.
     */
    if ( (value & X86_CR0_PE) == 0 )
    {
        /*
         * NOTE(review): PG-without-PE was already rejected at the top of this
         * function and 'value' has not changed since, so this check looks
         * unreachable.
         */
        if ( value & X86_CR0_PG )
        {
            vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
            return 0;
        }

        if ( vmx_assist(v, VMX_ASSIST_INVOKE) )
        {
            eip = __vmread(GUEST_RIP);
            HVM_DBG_LOG(DBG_LEVEL_1,
                        "Transfering control to vmxassist %%eip 0x%lx", eip);
            return 0; /* do not update eip! */
        }
    }
    else if ( v->arch.hvm_vmx.vmxassist_enabled )
    {
        /* PE is being set while vmxassist is active: switch back. */
        eip = __vmread(GUEST_RIP);
        HVM_DBG_LOG(DBG_LEVEL_1,
                    "Enabling CR0.PE at %%eip 0x%lx", eip);
        if ( vmx_assist(v, VMX_ASSIST_RESTORE) )
        {
            HVM_DBG_LOG(DBG_LEVEL_1,
                        "Restoring to %%eip 0x%lx", eip);
            return 0; /* do not update eip! */
        }
    }
    /* Protected mode with paging off: refresh the paging mode. */
    else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
        paging_update_paging_modes(v);

    return 1;
}

/*
 * Each of these expands to one 'case' of a switch over a general-purpose
 * register selector (REG_*). They rely on the enclosing scope providing
 * 'regs' (a pointer whose members are the register fields) and 'value'.
 *
 * CASE_SET_REG stores 'value' into the register; CASE_GET_REG loads the
 * register into 'value'.
 */
#define CASE_SET_REG(REG, reg)      \
    case REG_ ## REG: regs->reg = value; break
#define CASE_GET_REG(REG, reg)      \
    case REG_ ## REG: value = regs->reg; break

/*
 * Cases for r8-r15. The argument passed to CASE_EXTEND_REG ('S' or 'G') is
 * pasted into CASE_ ## T ## ET_REG to select CASE_SET_REG or CASE_GET_REG.
 */
#define CASE_EXTEND_SET_REG         \
    CASE_EXTEND_REG(S)
#define CASE_EXTEND_GET_REG         \
    CASE_EXTEND_REG(G)

/* On __i386__ no extra cases are generated. */
#ifdef __i386__
#define CASE_EXTEND_REG(T)
#else
#define CASE_EXTEND_REG(T)          \
    CASE_ ## T ## ET_REG(R8, r8);   \
    CASE_ ## T ## ET_REG(R9, r9);   \
    CASE_ ## T ## ET_REG(R10, r10); \
    CASE_ ## T ## ET_REG(R11, r11); \
    CASE_ ## T ## ET_REG(R12, r12); \
    CASE_ ## T ## ET_REG(R13, r13); \
    CASE_ ## T ## ET_REG(R14, r14); \
    CASE_ ## T ## ET_REG(R15, r15)
#endif

/*
 * Write to control registers
 *
 * gp selects the source general-purpose register (a REG_* value) and cr is
 * the destination control register number; CR0, CR3, CR4 and CR8 are handled.
 *
 * Returns 1 if the write was emulated and the caller should advance the guest
 * instruction pointer, or 0 if a #GP was injected or the domain was crashed.
 * For CR0 the result of vmx_set_cr0() is returned unchanged.
 */
static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
{
    unsigned long value, old_cr, old_base_mfn, mfn;
    struct vcpu *v = current;
    struct vlapic *vlapic = vcpu_vlapic(v);

    /* Fetch the source operand; ESP is read from the VMCS, not from regs. */
    switch ( gp )
    {
    CASE_GET_REG(EAX, eax);
    CASE_GET_REG(ECX, ecx);
    CASE_GET_REG(EDX, edx);
    CASE_GET_REG(EBX, ebx);
    CASE_GET_REG(EBP, ebp);
    CASE_GET_REG(ESI, esi);
    CASE_GET_REG(EDI, edi);
    CASE_EXTEND_GET_REG;
    case REG_ESP:
        value = __vmread(GUEST_RSP);
        break;
    default:
        gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
        goto exit_and_crash;
    }

    HVMTRACE_2D(CR_WRITE, v, cr, value);

    HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);

    switch ( cr )
    {
    case 0:
        return vmx_set_cr0(value);

    case 3:
        /*
         * If paging is not enabled yet, simply copy the value to CR3.
         */
        if ( !vmx_paging_enabled(v) )
        {
            v->arch.hvm_vmx.cpu_cr3 = value;
            break;
        }

        /*
         * We make a new one if the shadow does not exist.
         */
        if ( value == v->arch.hvm_vmx.cpu_cr3 ) {
            /*
             * This is simple TLB flush, implying the guest has
             * removed some translation or changed page attributes.
             * We simply invalidate the shadow.
             */
            mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
            /* The frame behind CR3 must still be the current guest table. */
            if ( mfn != pagetable_get_pfn(v->arch.guest_table) )
                goto bad_cr3;
            paging_update_cr3(v);
        } else {
            /*
             * If different, make a shadow. Check if the PDBR is valid
             * first.
             */
            HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
            mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
            if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), v->domain) )
                goto bad_cr3;
            /* Install the new table, then drop the reference on the old one. */
            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
            v->arch.guest_table = pagetable_from_pfn(mfn);
            if ( old_base_mfn )
                put_page(mfn_to_page(old_base_mfn));
            v->arch.hvm_vmx.cpu_cr3 = value;
            update_cr3(v);
            HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
        }
        break;

    case 4: /* CR4 */
        old_cr = v->arch.hvm_vmx.cpu_shadow_cr4;

        if ( value & HVM_CR4_GUEST_RESERVED_BITS )
        {
            HVM_DBG_LOG(DBG_LEVEL_1,
                        "Guest attempts to set reserved bit in CR4: %lx",
                        value);
            vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
            return 0;
        }

        /* PAE is being switched on. */
        if ( (value & X86_CR4_PAE) && !(old_cr & X86_CR4_PAE) )
        {
            if ( vmx_pgbit_test(v) )
            {
                /* The guest is a 32-bit PAE guest. */
#if CONFIG_PAGING_LEVELS >= 3
                /* NOTE: these shadow the function-scope mfn/old_base_mfn. */
                unsigned long mfn, old_base_mfn;
                mfn = get_mfn_from_gpfn(v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT);
                if ( !mfn_valid(mfn) ||
                     !get_page(mfn_to_page(mfn), v->domain) )
                    goto bad_cr3;

                /*
                 * Now arch.guest_table points to machine physical.
                 */
                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                v->arch.guest_table = pagetable_from_pfn(mfn);
                if ( old_base_mfn )
                    put_page(mfn_to_page(old_base_mfn));

                HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                            (unsigned long) (mfn << PAGE_SHIFT));

                HVM_DBG_LOG(DBG_LEVEL_VMMU,
                            "Update CR3 value = %lx, mfn = %lx",
                            v->arch.hvm_vmx.cpu_cr3, mfn);
#endif
            }
        }
        /* PAE is clear in the new value: not allowed while in long mode. */
        else if ( !(value & X86_CR4_PAE) )
        {
            if ( unlikely(vmx_long_mode_enabled(v)) )
            {
                HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while "
                            "EFER.LMA is set");
                vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
                return 0;
            }
        }

        /* Real CR4 gets the host-required bits; the shadow keeps the guest value. */
        __vmwrite(GUEST_CR4, value | HVM_CR4_HOST_MASK);
        v->arch.hvm_vmx.cpu_shadow_cr4 = value;
        __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);

        /*
         * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
         * all TLB entries except global entries.
         */
        if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
            paging_update_paging_modes(v);

        break;

    case 8:
        /* The low four bits of the value go into bits 7:4 of the vlapic TPR. */
        vlapic_set_reg(vlapic, APIC_TASKPRI, ((value & 0x0F) << 4));
        break;

    default:
        gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
        domain_crash(v->domain);
        return 0;
    }

    return 1;

 bad_cr3:
    gdprintk(XENLOG_ERR, "Invalid CR3\n");
 exit_and_crash:
    domain_crash(v->domain);
    return 0;
}

/*
 * Read from control registers. CR0 and CR4 are read from the shadow.
 *
 * Only CR3 and CR8 are handled here. cr is the source control register
 * number and gp selects the destination general-purpose register (a REG_*
 * value). An unknown cr or gp crashes the domain; the function still runs to
 * completion so that the trace and debug records are emitted.
 */
static void mov_from_cr(int cr, int gp, struct cpu_user_regs *regs)
{
    unsigned long value = 0;
    struct vcpu *v = current;
    struct vlapic *vlapic = vcpu_vlapic(v);

    switch ( cr )
    {
    case 3:
        value = (unsigned long)v->arch.hvm_vmx.cpu_cr3;
        break;
    case 8:
        /* CR8 is taken from bits 7:4 of the vlapic task-priority register. */
        value = (unsigned long)vlapic_get_reg(vlapic, APIC_TASKPRI);
        value = (value & 0xF0) >> 4;
        break;
    default:
        gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr);
        domain_crash(v->domain);
        break;
    }

    switch ( gp ) {
    CASE_SET_REG(EAX, eax);
    CASE_SET_REG(ECX, ecx);
    CASE_SET_REG(EDX, edx);
    CASE_SET_REG(EBX, ebx);
    CASE_SET_REG(EBP, ebp);
    CASE_SET_REG(ESI, esi);
    CASE_SET_REG(EDI, edi);
    CASE_EXTEND_SET_REG;
    case REG_ESP:
        /* ESP is written to the VMCS as well as to the register frame. */
        __vmwrite(GUEST_RSP, value);
        regs->esp = value;
        break;
    default:
        /* Log the same way as the other error paths in the CR handlers. */
        gdprintk(XENLOG_ERR, "invalid gp: %d\n", gp);
        domain_crash(v->domain);
        break;
    }

    HVMTRACE_2D(CR_READ, v, cr, value);

    HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
}

/*
 * Handle a control-register-access VM exit for the current vcpu.
 *
 * exit_qualification encodes the access type (MOV to CR, MOV from CR, CLTS
 * or LMSW) together with the control register number, the general-purpose
 * register and, for LMSW, the source data.
 *
 * Returns 1 if the caller should advance the guest instruction pointer past
 * the instruction, or 0 if it must not (propagated from mov_to_cr() or
 * vmx_set_cr0()).
 */
static int vmx_cr_access(unsigned long exit_qualification,
                         struct cpu_user_regs *regs)
{
    unsigned int gp, cr;
    unsigned long value;
    struct vcpu *v = current;

    switch ( exit_qualification & CONTROL_REG_ACCESS_TYPE ) {
    case TYPE_MOV_TO_CR:
        gp = exit_qualification & CONTROL_REG_ACCESS_REG;
        cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
        return mov_to_cr(gp, cr, regs);
    case TYPE_MOV_FROM_CR:
        gp = exit_qualification & CONTROL_REG_ACCESS_REG;
        cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
        mov_from_cr(cr, gp, regs);
        break;
    case TYPE_CLTS:
        /* We initialise the FPU now, to avoid needing another vmexit. */
        setup_fpu(v);
        __vm_clear_bit(EXCEPTION_BITMAP, TRAP_no_device);

        v->arch.hvm_vmx.cpu_cr0 &= ~X86_CR0_TS; /* clear TS */
        __vmwrite(GUEST_CR0, v->arch.hvm_vmx.cpu_cr0);

        v->arch.hvm_vmx.cpu_shadow_cr0 &= ~X86_CR0_TS; /* clear TS */
        __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
        break;
    case TYPE_LMSW:
        value = v->arch.hvm_vmx.cpu_shadow_cr0;
        /*
         * LMSW can set bits 0-3 (PE, MP, EM, TS) but can clear only bits 1-3:
         * it cannot be used to clear PE. Keep the current PE bit and OR in
         * the source operand's low nibble.
         */
        value = (value & ~0xE) |
            (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF);
        return vmx_set_cr0(value);
    default:
        BUG();
    }

    return 1;
}

static int vmx_do_msr_read(struct cpu_user_regs *regs)
{
    u64 msr_content = 0;
    u32 ecx = regs->ecx, eax, edx;
    struct vcpu *v = current;

    HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);

    switch ( ecx )
    {
    case MSR_IA32_TIME_STAMP_COUNTER:
        msr_content = hvm_get_guest_time(v);
        break;
    case MSR_IA32_SYSENTER_CS:
        msr_content = (u32)__vmread(GUEST_SYSENTER_CS);
        break;
    case MSR_IA32_SYSENTER_ESP:
        msr_content = __vmread(GUEST_SYSENTER_ESP);
        break;
    case MSR_IA32_SYSENTER_EIP:
        msr_content = __vmread(GUEST_SYSENTER_EIP);
        break;
    case MSR_IA32_APICBASE:
        msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
        break;
    case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_CR4_FIXED1:
        goto gp_fault;
    case MSR_IA32_MCG_CAP:
    case MSR_IA32_MCG_STATUS:
    case MSR_IA32_MC0_STATUS:
    case MSR_K8_MC1_STATUS:
    case MSR_K8_MC2_STATUS:
    case MSR_K8_MC3_STATUS:
    case MSR_K8_MC4_STATUS:
    case MSR_K8_MC5_STATUS:
        /* No point in letting the guest see real MCEs */
        msr_content = 0;
        break;
    default:
        if ( long_mode_do_msr_read(regs) )
            goto done;

        if ( rdmsr_hypervisor_regs(ecx, &eax, &edx) ||
             rdmsr_safe(ecx, eax, edx) == 0 )
        {
            regs->eax = eax;
            regs->edx = edx;
            goto done;
        }

        goto gp_fault;
    }

    regs->eax = msr_content & 0xFFFFFFFF;
    regs->edx = msr_content >> 32;

done:
    HVMTRACE_2D(MSR_READ, v, ecx, msr_content);
    HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
                ecx, (unsigned long)regs->eax,
                (unsigned long)regs->edx);
    return 1;

gp_fault:
    vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
    return 0;
}

/*
 * Allocate the domain's APIC access page.
 *
 * When the CPU supports virtualized APIC accesses, a xenheap page is
 * allocated, shared writable with domain d, added to the guest physmap at
 * APIC_DEFAULT_PHYS_BASE, and its MFN recorded in the domain.
 *
 * Returns 0 on success (also when the feature is absent and nothing is
 * done), or -ENOMEM if the page cannot be allocated.
 */
static int vmx_alloc_vlapic_mapping(struct domain *d)
{
    void *page_va;
    unsigned long apic_mfn;

    if ( cpu_has_vmx_virtualize_apic_accesses )
    {
        page_va = alloc_xenheap_page();
        if ( page_va == NULL )
            return -ENOMEM;

        apic_mfn = virt_to_mfn(page_va);

        share_xen_page_with_guest(virt_to_page(page_va), d, XENSHARE_writable);
        guest_physmap_add_page(
            d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), apic_mfn);
        d->arch.hvm_domain.vmx_apic_access_mfn = apic_mfn;
    }

    return 0;
}

/*
 * Free the domain's APIC access page, if one was recorded.
 * Nothing is done when the stored MFN is zero or not a valid MFN.
 */
static void vmx_free_vlapic_mapping(struct domain *d)
{
    unsigned long apic_mfn = d->arch.hvm_domain.vmx_apic_access_mfn;

    if ( apic_mfn == 0 )
        return;

    if ( !mfn_valid(apic_mfn) )
        return;

    free_xenheap_page(mfn_to_virt(apic_mfn));
}

/*
 * Program vcpu v's VMCS with the machine addresses of its virtual-APIC page
 * (the vlapic register page) and of the domain's APIC access page.
 * Does nothing when the CPU lacks virtualized APIC accesses.
 */
static void vmx_install_vlapic_mapping(struct vcpu *v)
{
    unsigned long vapic_maddr, access_maddr;

    if ( !cpu_has_vmx_virtualize_apic_accesses )
        return;

    vapic_maddr  = page_to_maddr(vcpu_vlapic(v)->regs_page);
    /* Turn the recorded MFN into a machine address. */
    access_maddr =
        (unsigned long)v->domain->arch.hvm_domain.vmx_apic_access_mfn
        << PAGE_SHIFT;

    vmx_vmcs_enter(v);
    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, vapic_maddr);
    __vmwrite(APIC_ACCESS_ADDR, access_maddr);
    vmx_vmcs_exit(v);
}

/*
 * Re-evaluate whether APIC accesses should be virtualized for vcpu v.
 *
 * The SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES control is set only while the
 * vlapic is not hardware-disabled and sits at APIC_DEFAULT_PHYS_BASE; in all
 * other cases it is cleared. Does nothing when the CPU lacks the feature.
 */
static void vmx_check_vlapic_msr(struct vcpu *v)
{
    struct vlapic *apic = vcpu_vlapic(v);
    uint32_t exec2;

    if ( !cpu_has_vmx_virtualize_apic_accesses )
        return;

    exec2 = __vmread(SECONDARY_VM_EXEC_CONTROL);

    if ( vlapic_hw_disabled(apic) ||
         (vlapic_base_address(apic) != APIC_DEFAULT_PHYS_BASE) )
        exec2 &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
    else
        exec2 |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

    __vmwrite(SECONDARY_VM_EXEC_CONTROL, exec2);
}

static int vmx_do_msr_write(struct cpu_user_regs *regs)
{
    u32 ecx = regs->ecx;
    u64 msr_content;
    struct vcpu *v = current;

    HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
                ecx, (u32)regs->eax, (u32)regs->edx);

    msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
    HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content);

    switch ( ecx )
    {
    case MSR_IA32_TIME_STAMP_COUNTER:
        hvm_set_guest_time(v, msr_content);
        pt_reset(v);
        break;
    case MSR_IA32_SYSENTER_CS:
        __vmwrite(GUEST_SYSENTER_CS, msr_content);
        break;
    case MSR_IA32_SYSENTER_ESP:
        __vmwrite(GUEST_SYSENTER_ESP, msr_content);
        break;
    case MSR_IA32_SYSENTER_EIP:
        __vmwrite(GUEST_SYSENTER_EIP, msr_content);
        break;
    case MSR_IA32_APICBASE:
        vlapic_msr_set(vcpu_vlapic(v), msr_content);
        vmx_check_vlapic_msr(v);
        break;
    case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_CR4_FIXED1:
        goto gp_fault;
    default:
        if ( !long_mode_do_msr_write(regs) )
            wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
        break;
    }

    return 1;

gp_fault:
    vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
    return 0;
}

/*
 * Handle a HLT exit: record the trace event, then pass the guest's RFLAGS
 * (read from the VMCS) to hvm_hlt().
 */
static void vmx_do_hlt(void)
{
    HVMTRACE_0D(HLT, current);
    hvm_hlt(__vmread(GUEST_RFLAGS));
}

/*
 * Handle an external-interrupt VM exit by dispatching the interrupt vector
 * reported in the VMCS to the matching host interrupt handler. Vectors with
 * no dedicated handler are passed to do_IRQ() via regs->entry_vector.
 */
static void vmx_do_extint(struct cpu_user_regs *regs)
{
    unsigned int intr_info, vec;

    asmlinkage void do_IRQ(struct cpu_user_regs *);
    fastcall void smp_apic_timer_interrupt(struct cpu_user_regs *);
    fastcall void smp_event_check_interrupt(void);
    fastcall void smp_invalidate_interrupt(void);
    fastcall void smp_call_function_interrupt(void);
    fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
    fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
#ifdef CONFIG_X86_MCE_P4THERMAL
    fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
#endif

    /* The interruption information must be marked valid on this exit. */
    intr_info = __vmread(VM_EXIT_INTR_INFO);
    BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));

    vec = intr_info & INTR_INFO_VECTOR_MASK;
    HVMTRACE_1D(INTR, current, vec);

    switch ( vec )
    {
    case LOCAL_TIMER_VECTOR:
        smp_apic_timer_interrupt(regs);
        break;

    case EVENT_CHECK_VECTOR:
        smp_event_check_interrupt();
        break;

    case INVALIDATE_TLB_VECTOR:
        smp_invalidate_interrupt();
        break;

    case CALL_FUNCTION_VECTOR:
        smp_call_function_interrupt();
        break;

    case SPURIOUS_APIC_VECTOR:
        smp_spurious_interrupt(regs);
        break;

    case ERROR_APIC_VECTOR:
        smp_error_interrupt(regs);
        break;

#ifdef CONFIG_X86_MCE_P4THERMAL
    case THERMAL_APIC_VECTOR:
        smp_thermal_interrupt(regs);
        break;
#endif

    default:
        /* Everything else is an ordinary IRQ. */
        regs->entry_vector = vec;
        do_IRQ(regs);
        break;
    }
}

/*
 * Re-inject into vcpu v the exception that caused the current VM exit, using
 * the vector, type and (if present) error code from the VMCS exit
 * interruption information.
 */
static void vmx_reflect_exception(struct vcpu *v)
{
    int error_code, intr_info, vector;

    intr_info = __vmread(VM_EXIT_INTR_INFO);
    vector = intr_info & 0xff;

    /* Only fetch an error code when the exit information says one exists. */
    if ( !(intr_info & INTR_INFO_DELIVER_CODE_MASK) )
        error_code = VMX_DELIVER_NO_ERROR_CODE;
    else
        error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);

#ifndef NDEBUG
    {
        unsigned long guest_rip = __vmread(GUEST_RIP);

        HVM_DBG_LOG(DBG_LEVEL_1, "rip = %lx, error_code = %x",
                    guest_rip, error_code);
    }
#endif /* NDEBUG */

    /*
     * According to Intel Virtualization Technology Specification for
     * the IA-32 Intel Architecture (C97063-002 April 2005), section
     * 2.8.3, SW_EXCEPTION should be used for #BP and #OV, and
     * HW_EXCEPTION used for everything else.  The main difference
     * appears to be that for SW_EXCEPTION, the EIP/RIP is incremented
     * by VM_ENTER_INSTRUCTION_LEN bytes, whereas for HW_EXCEPTION,
     * it is not.
     */
    if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) != INTR_TYPE_SW_EXCEPTION )
    {
        vmx_inject_hw_exception(v, vector, error_code);
    }
    else
    {
        int ilen = __get_instruction_length(); /* Safe: software exception */
        vmx_inject_sw_exception(v, vector, ilen);
    }
}

/*
 * Report a failed VM entry and crash the current domain.
 *
 * exit_reason is the raw exit reason read from the VMCS; its low 16 bits
 * identify why the entry failed. For a machine-check failure the guest
 * registers are stored into regs and the host machine-check handler is run.
 * The VMCS is dumped before the domain is crashed.
 */
static void vmx_failed_vmentry(unsigned int exit_reason,
                               struct cpu_user_regs *regs)
{
    unsigned int failed_vmentry_reason = (uint16_t)exit_reason;
    unsigned long exit_qualification;

    exit_qualification = __vmread(EXIT_QUALIFICATION);
    printk("Failed vm entry (exit reason 0x%x) ", exit_reason);
    switch ( failed_vmentry_reason )
    {
    case EXIT_REASON_INVALID_GUEST_STATE:
        printk("caused by invalid guest state (%ld).\n", exit_qualification);
        break;
    case EXIT_REASON_MSR_LOADING:
        printk("caused by MSR entry %ld loading.\n", exit_qualification);
        break;
    case EXIT_REASON_MACHINE_CHECK:
        printk("caused by machine check.\n");
        HVMTRACE_0D(MCE, current);
        vmx_store_cpu_guest_regs(current, regs, NULL);
        do_machine_check(regs);
        break;
    default:
        /* Terminate the line so the VMCS banner below starts on its own. */
        printk("reason not known yet!\n");
        break;
    }

    printk("************* VMCS Area **************\n");
    vmcs_dump_vcpu();
    printk("**************************************\n");

    domain_crash(current->domain);
}

/*
 * Top-level VM exit dispatcher for the current vcpu.
 *
 * Reads the exit reason from the VMCS and dispatches to the matching handler.
 * Interrupts are re-enabled up front for every exit reason except external
 * interrupts. A failed VM entry is handed to vmx_failed_vmentry(). Exit
 * reasons with no handler, and exception vectors that are not handled here,
 * crash the domain.
 *
 * regs is the guest register frame for this exit.
 */
asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
    unsigned int exit_reason;
    unsigned long exit_qualification, inst_len = 0;
    struct vcpu *v = current;

    exit_reason = __vmread(VM_EXIT_REASON);

    HVMTRACE_2D(VMEXIT, v, __vmread(GUEST_RIP), exit_reason);

    perfc_incra(vmexits, exit_reason);

    /* External-interrupt exits keep interrupts off until vmx_do_extint(). */
    if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
        local_irq_enable();

    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
        return vmx_failed_vmentry(exit_reason, regs);

    hvm_maybe_deassert_evtchn_irq();

    switch ( exit_reason )
    {
    case EXIT_REASON_EXCEPTION_NMI:
    {
        /*
         * We don't set the software-interrupt exiting (INT n).
         * (1) We can get an exception (e.g. #PF) in the guest, or
         * (2) NMI
         */
        unsigned int intr_info, vector;

        intr_info = __vmread(VM_EXIT_INTR_INFO);
        BUG_ON(!(intr_info & INTR_INFO_VALID_MASK));

        vector = intr_info & INTR_INFO_VECTOR_MASK;

        perfc_incra(cause_vector, vector);

        switch ( vector )
        {
        case TRAP_debug:
        case TRAP_int3:
            /* Debug traps are only expected with a debugger attached. */
            if ( !v->domain->debugger_attached )
                goto exit_and_crash;
            domain_pause_for_debugger();
            break;
        case TRAP_no_device:
            vmx_do_no_device_fault();
            break;
        case TRAP_page_fault:
            /* The exit qualification is used as the faulting address. */
            exit_qualification = __vmread(EXIT_QUALIFICATION);
            regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);

            HVM_DBG_LOG(DBG_LEVEL_VMMU,
                        "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
                        (unsigned long)regs->eax, (unsigned long)regs->ebx,
                        (unsigned long)regs->ecx, (unsigned long)regs->edx,
                        (unsigned long)regs->esi, (unsigned long)regs->edi);

            /* A non-zero result means the paging code handled the fault. */
            if ( paging_fault(exit_qualification, regs) )
            {
                HVMTRACE_2D(PF_XEN, v, exit_qualification, regs->error_code);
                break;
            }

            /* Otherwise record the address as CR2 and reflect the #PF. */
            v->arch.hvm_vmx.cpu_cr2 = exit_qualification;
            vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
            break;
        case TRAP_nmi:
            if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI )
            {
                HVMTRACE_0D(NMI, v);
                vmx_store_cpu_guest_regs(v, regs, NULL);
                do_nmi(regs); /* Real NMI, vector 2: normal processing. */
            }
            else
                vmx_reflect_exception(v);
            break;
        case TRAP_machine_check:
            HVMTRACE_0D(MCE, v);
            vmx_store_cpu_guest_regs(v, regs, NULL);
            do_machine_check(regs);
            break;
        default:
            goto exit_and_crash;
        }
        break;
    }
    case EXIT_REASON_EXTERNAL_INTERRUPT:
        vmx_do_extint(regs);
        break;
    case EXIT_REASON_TRIPLE_FAULT:
        hvm_triple_fault();
        break;
    case EXIT_REASON_PENDING_VIRT_INTR:
        /* Disable the interrupt window. */
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.exec_control);
        break;
    case EXIT_REASON_PENDING_VIRT_NMI:
        /* Disable the NMI window. */
        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
                  v->arch.hvm_vmx.exec_control);
        break;
    case EXIT_REASON_TASK_SWITCH:
        /* Task switches are not handled here: crash the domain. */
        goto exit_and_crash;
    case EXIT_REASON_CPUID:
        /* Step past the instruction first, then emulate it. */
        inst_len = __get_instruction_length(); /* Safe: CPUID */
        __update_guest_eip(inst_len);
        vmx_do_cpuid(regs);
        break;
    case EXIT_REASON_HLT:
        inst_len = __get_instruction_length(); /* Safe: HLT */
        __update_guest_eip(inst_len);
        vmx_do_hlt();
        break;
    case EXIT_REASON_INVLPG:
    {
        inst_len = __get_instruction_length(); /* Safe: INVLPG */
        __update_guest_eip(inst_len);
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_do_invlpg(exit_qualification);
        break;
    }
    case EXIT_REASON_VMCALL:
    {
        int rc;
        HVMTRACE_1D(VMMCALL, v, regs->eax);
        inst_len = __get_instruction_length(); /* Safe: VMCALL */
        rc = hvm_do_hypercall(regs);
        /*
         * A preempted hypercall leaves the instruction pointer on VMCALL
         * (presumably so the guest re-issues it - confirm against
         * hvm_do_hypercall()).
         */
        if ( rc != HVM_HCALL_preempted )
        {
            __update_guest_eip(inst_len);
            if ( rc == HVM_HCALL_invalidate )
                send_invalidate_req();
        }
        break;
    }
    case EXIT_REASON_CR_ACCESS:
    {
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        inst_len = __get_instruction_length(); /* Safe: MOV Cn, LMSW, CLTS */
        /* Only skip the instruction when the handler asks for it. */
        if ( vmx_cr_access(exit_qualification, regs) )
            __update_guest_eip(inst_len);
        break;
    }
    case EXIT_REASON_DR_ACCESS:
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        vmx_dr_access(exit_qualification, regs);
        break;
    case EXIT_REASON_IO_INSTRUCTION:
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        inst_len = __get_instruction_length(); /* Safe: IN, INS, OUT, OUTS */
        vmx_io_instruction(exit_qualification, inst_len);
        break;
    case EXIT_REASON_MSR_READ:
        inst_len = __get_instruction_length(); /* Safe: RDMSR */
        if ( vmx_do_msr_read(regs) )
            __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_MSR_WRITE:
        inst_len = __get_instruction_length(); /* Safe: WRMSR */
        if ( vmx_do_msr_write(regs) )
            __update_guest_eip(inst_len);
        break;

    /* MWAIT/MONITOR and the VMX instructions are answered with #UD. */
    case EXIT_REASON_MWAIT_INSTRUCTION:
    case EXIT_REASON_MONITOR_INSTRUCTION:
    case EXIT_REASON_VMCLEAR:
    case EXIT_REASON_VMLAUNCH:
    case EXIT_REASON_VMPTRLD:
    case EXIT_REASON_VMPTRST:
    case EXIT_REASON_VMREAD:
    case EXIT_REASON_VMRESUME:
    case EXIT_REASON_VMWRITE:
    case EXIT_REASON_VMXOFF:
    case EXIT_REASON_VMXON:
        vmx_inject_hw_exception(v, TRAP_invalid_op, VMX_DELIVER_NO_ERROR_CODE);
        break;

    /* Nothing to do here for a TPR-below-threshold exit. */
    case EXIT_REASON_TPR_BELOW_THRESHOLD:
        break;

    case EXIT_REASON_APIC_ACCESS:
    {
        /* The low 12 bits of the qualification are the offset in the APIC page. */
        unsigned long offset;
        exit_qualification = __vmread(EXIT_QUALIFICATION);
        offset = exit_qualification & 0x0fffUL;
        handle_mmio(APIC_DEFAULT_PHYS_BASE | offset);
        break;
    }

    case EXIT_REASON_INVD:
    {
        /* INVD is simply skipped. */
        inst_len = __get_instruction_length(); /* Safe: INVD */
        __update_guest_eip(inst_len);
        break;
    }

    default:
    exit_and_crash:
        gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
        domain_crash(v->domain);
        break;
    }
}

/* Emit a VMENTRY trace record for the current vcpu. */
asmlinkage void vmx_trace_vmentry(void)
{
    HVMTRACE_0D(VMENTRY, current);
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */
