/* Basic suspend infrastructure */
#include "ntddk.h"
#include "xsapi.h"
#include "hypercall.h"
#include "xenhdrs/sched.h"
#include "xscompat.h"
#include "hvm.h"

/* Held by the initiating thread from QuiesceSystem() until
   UnquiesceSystem(), so only one quiesce can be in flight at once. */
static KSPIN_LOCK _g_QuiesceLock;

/* How many processors have not yet been captured in their DPCs. */
static LONG _g_ProcessorSpinCount1;

/* Reference count on the processorCount local in QuiesceSystem():
   that stack location must not be reused until this reaches 0. */
static LONG _g_ProcessorSpinCount2;

/* 0 while the system runs normally; set to 1 by QuiesceSystem() once
   every other processor is in its DPC, and back to 0 by
   UnquiesceSystem() to release them. */
static LONG _g_QuiescePhase;

/* One DPC object per processor, indexed by processor number, used to
   run SpinProcessorDpc on each of the other processors. */
static KDPC _g_ProcessorSpinDpc[MAXIMUM_PROCESSORS];

/* At various points we need to capture all of the processors in the
   system and get them into a known state (e.g. binpatch,
   suspend/resume).  The protocol goes as follows:

   0) We start in phase 0.

   1) Thread A decides it wants to quiesce the system.

   2) Thread A acquires the QuiesceLock.  This locks it to a
      particular vcpu (say vcpu A), raises to DISPATCH_LEVEL, and
      prevents any other thread trying to do a quiesce at the same
      time.

   3) Vcpu A sets SpinCount1 to the number of vcpus not currently
      running our stuff at DPC level (i.e. nr_cpus - 1).  It sets
      processorCount to the number of vcpus with interrupts enabled
      (i.e. the total number of cpus).  It sets SpinCount2 to the
      number of other vcpus which might be referencing processorCount
      (i.e. nr_cpus - 1).

   4) Vcpu A launches DPCs at every other vcpu in the system.  The
      vcpus acknowledge that they're running by decrementing
      SpinCount1.  Vcpu A waits for SpinCount1 to hit 0, which
      indicates that every vcpu is now captured at DISPATCH_LEVEL.
      i.e. we no longer have to worry about deadlocking with someone
      else's DPC.

   5) Once every other vcpu is spinning in our DPC, we go to phase 1.
      This causes the other vcpus to disable interrupts and decrement
      processorCount.  Once processorCount goes to 0, every vcpu
      advances to the next phase. This is tricky, because we could
      still deadlock if an interrupt ever sends a synchronous IPI, and
      it looks like some bits of Windows do that.  We therefore have a
      timeout.  Each vcpu runs the timeout independently.  If it
      fires, we cmpxchg processorCount back up again (if it's gone to
      0, we need to continue), re-enable interrupts, back off a bit,
      and retry.

   6) Once a vcpu has seen processorCount go to 0, it decrements
      SpinCount2 to publicise that fact.  If it's an auxiliary vcpu,
      it's now finished with the quiesce, and sits waiting for an
      unquiesce.

   7) If it was the initiating vcpu, it waits for SpinCount2 to go to
      zero, so that it can deallocate processorCount.

   8) Once SpinCount2 goes to 0 on the initiator, the system is fully
      quiescent, and QuiesceSystem() returns.

   9) Eventually, we finish whatever it was we were doing in the
      quiescent critical section, and call UnquiesceSystem().
      UnquiesceSystem() sets the QuiescePhase back to 0, releasing the
      auxiliaries which are waiting at step 6.  The auxiliaries
      reenable interrupts and return from their DPCs.

   10) The initiator finally drops the quiesce lock and continues
       running normally.

   We need to do a two-stage capture to avoid a three-way deadlock
   with IPIs: vcpu A starts to quiesce the system at the same time as
   vcpu B tries to IPI vcpu C for a TLB flush.  If vcpu A gets to C
   before B does, C will be stuck spinning waiting for the quiesce to
   complete, but that can't happen until A captures B, but that can't
   happen until C completes the flush for B.  Leaving interrupts
   enabled until all vcpus are in the DPC prevents this, since only
   interrupts can preempt a DPC and we know that Windows never sends
   IPIs from interrupts.
*/

/* Number of TSC cycles a wait loop in this file may spin before it
   logs a warning.  The value is arbitrary and uncalibrated; using raw
   TSC cycles avoids having to calibrate the TSC or rely on the system
   timers while processors are being captured. */
#define TSC_SPIN_LIMIT 2000000000

/* Number of times a processor polls the shared counter at HIGH_LEVEL
 * before it gives up, re-enables interrupts and retries. */
#define QUIESCE_WAIT 2048

/* Rendezvous with every other processor taking part in a quiesce, with
 * interrupts blocked.
 *
 * Count points at the shared number of processors which have not yet
 * raised to HIGH_LEVEL.  Each participant raises its IRQL, decrements
 * the counter and polls until it reaches zero.  If that does not
 * happen within QUIESCE_WAIT polls, the processor re-increments the
 * counter (only if it is still non-zero), drops back to its previous
 * IRQL, yields to the hypervisor and starts again.
 *
 * On return the counter has been observed at zero and the calling
 * processor is at HIGH_LEVEL.  The return value is the IRQL the
 * processor was at before it was raised, for a later KeLowerIrql().
 *
 * The counter is modified concurrently by other processors, so every
 * plain read goes through a volatile-qualified pointer; otherwise the
 * compiler would be entitled to hoist the load out of the polling
 * loop, which contains no call or barrier until the back-off
 * threshold is reached.
 */
static KIRQL
GatherInterruptDisabledProcessors(
    IN  PLONG Count
    )
{
    volatile LONG *remaining = Count;
    KIRQL oldIrql;
    int spinCount;
    ULONG64 tsc_start = ReadTimeStampCounter();
    ULONG64 tsc_now;

    do
    {
        tsc_now = ReadTimeStampCounter();
        if (tsc_now - tsc_start >= TSC_SPIN_LIMIT) {
            TraceWarning(("Took a very long time to corral all processors with interrupts disabled %d: started at %I64x, still going at %I64x\n",
                          KeGetCurrentProcessorNumber(), tsc_start, tsc_now));
            /* Restart the interval so the warning is rate limited. */
            tsc_start = tsc_now;
        }

        /* Raise to high level, blocking all interrupts. */
        KeRaiseIrql(HIGH_LEVEL, &oldIrql);
        InterlockedDecrement(Count);

        spinCount = 0;
        while (*remaining)
        {
            spinCount++;
            if (spinCount > QUIESCE_WAIT)
            {
                /* Back off.  Bump the number of processors which haven't
                 * achieved interrupt disabled state and reenable interrupts.
                 * N.B. Must never bump from zero as if zero is seen other
                 * processors will have advanced out of this routine. */
                LONG old = *remaining;
                LONG bumped = old + 1;

                if (old == 0)
                {
                    break;
                }
                if (InterlockedCompareExchange(Count, bumped, old) == old)
                {
                    /* The exchange happened, the count has been safely
                     * incremented.  The outer loop will see a non-zero
                     * count and retry from the top. */
                    KeLowerIrql(oldIrql);
                    break;
                }
                /* The counter changed under us; keep polling and try
                 * the exchange again on the next iteration. */
            }
        }
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
        XsMemoryBarrier();
    } while (*remaining);

    return oldIrql;
}

/* DPC routine queued by QuiesceSystem() on every processor other than
 * the initiator.  It announces its arrival, waits for the quiesce
 * phase to become non-zero, joins the interrupts-disabled rendezvous,
 * and then spins until the phase returns to zero.
 *
 * DeferredContext points at the shared counter handed to
 * GatherInterruptDisabledProcessors().  The remaining arguments are
 * unused. */
static VOID
SpinProcessorDpc(
    IN  PKDPC Dpc,
    IN  PVOID DeferredContext,
    IN  PVOID SystemArgument1,
    IN  PVOID SystemArgument2
    )
{
    KIRQL savedIrql;

    UNREFERENCED_PARAMETER(Dpc);
    UNREFERENCED_PARAMETER(SystemArgument1);
    UNREFERENCED_PARAMETER(SystemArgument2);

    /* Let the initiator know this processor has entered its DPC. */
    ASSERT(_g_ProcessorSpinCount1 > 0);
    InterlockedDecrement(&_g_ProcessorSpinCount1);

    /* Hold here until the initiator moves the system out of phase 0. */
    for (;;)
    {
        if (_g_QuiescePhase != 0)
        {
            break;
        }
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
        XsMemoryBarrier();
    }

    savedIrql = GatherInterruptDisabledProcessors((PLONG)DeferredContext);

    /* Report that this processor is fully quiescent.  DeferredContext
     * must not be dereferenced once this decrement has been made. */
    ASSERT(_g_ProcessorSpinCount2 > 0);
    InterlockedDecrement(&_g_ProcessorSpinCount2);

    /* Stay captured until the phase drops back to 0. */
    for (;;)
    {
        if (_g_QuiescePhase == 0)
        {
            break;
        }
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
        XsMemoryBarrier();
    }

    /* Released: synchronise, then restore the IRQL handed back by the
     * gather step. */
    XsMemoryBarrier();
    FLUSH_PIPELINE();

    KeLowerIrql(savedIrql);
}

/* Get every other CPU into a known state (spinning in
 * SpinProcessorDpc).  Acquires the quiesce lock, leaves the calling
 * processor at HIGH_LEVEL, and returns the IRQL the caller was at
 * before the lock was taken; pass that value to UnquiesceSystem().
 * The idea is that once you've called this, nothing apart from you
 * will run until you call UnquiesceSystem. */
/* It's really easy to deadlock yourself using this.  If you ever have
 * a DPC which waits for something on another CPU, anywhere in the
 * system, we can deadlock with it.  Even worse, this is a fundamental
 * property of the thing we're trying to achieve.  Work around this
 * using a timeout which backs off and retries if it looks like we're
 * taking too long. */
KIRQL
QuiesceSystem(void)
{
    UCHAR number;
    ULONG me;
    KIRQL oldIrql;
    PKDPC dpc;
    LONG NumberOfCpus = KeNumberProcessors;
    ULONG64 tsc_start;
    ULONG64 tsc_now;
    LONG processorCount;

    ASSERT(_g_QuiescePhase == 0);

    KeAcquireSpinLock(&_g_QuiesceLock, &oldIrql);

    /* We don't want to use system time for the main capture process,
       because some HALs update it from a ticker interrupt, and we
       might have turned the ticker off on some subset of the CPUs
       when things go wrong. */
    tsc_start = ReadTimeStampCounter();

    /* Every processor except this one must check in twice: once on
       entering its DPC, once when it has finished with
       processorCount. */
    _g_ProcessorSpinCount1 = _g_ProcessorSpinCount2 = NumberOfCpus - 1;

    /* Every processor, including this one, takes part in the
       interrupts-disabled rendezvous. */
    processorCount = NumberOfCpus;

    me = KeGetCurrentProcessorNumber();
    for (number = 0; number < NumberOfCpus; number++)
    {
        if (number == me)
        {
            //
            // Don't send a message to yourself.
            //

            continue;
        }
        dpc = &_g_ProcessorSpinDpc[number];
        KeInitializeDpc(dpc, SpinProcessorDpc, &processorCount);
        KeSetTargetProcessorDpc(dpc, number);
        KeSetImportanceDpc(dpc, HighImportance);
        KeInsertQueueDpc(dpc, NULL, NULL);
    }

    /* Wait for the DPCs to start on all vcpus. */
    while (_g_ProcessorSpinCount1) {
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
        XsMemoryBarrier();

        tsc_now = ReadTimeStampCounter();
        /* XXX Arbitrary limit of 2000000000 cycles before we print a
           warning.  This gives a plausible timeout for any processor
           with a clockspeed between about 500MHz and about 20GHz, and
           avoids needing to calibrate the TSC or rely on the system
           timers. */
        if (tsc_now - tsc_start >= TSC_SPIN_LIMIT) {
            TraceWarning(("Took a very long time to start DPCs: started at %I64x, still going at %I64x\n",
                          tsc_start, tsc_now));
            /* Restart the interval.  Without this the condition stays
               true and a warning is logged on every iteration. */
            tsc_start = tsc_now;
        }
    }

    /* All CPUs captured.  Turn interrupts off on other cpus. */
    _g_QuiescePhase = 1;

    /* Capture other cpus and disable interrupts.  The IRQL handed back
       is deliberately ignored: the caller gets the IRQL saved when the
       spin lock was acquired. */
    (void)GatherInterruptDisabledProcessors(&processorCount);

    /* Wait for other CPUs to acknowledge they have advanced out of the loop
     * where they were waiting for everyone to disable interrupts.
     *
     * WARNING: Don't return from this routine until you *KNOW* that all DPCs
     * have advanced beyond the point where they are using the DPC's deferred
     * context parameter which is the local variable processorCount in this
     * routine.
     *
     * This wait accomplishes that goal.  Also, the two global SpinCounts are
     * not used again after this loop on any processor, until the next suspend.
     */

    tsc_start = ReadTimeStampCounter();
    while (_g_ProcessorSpinCount2) {
        HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
        XsMemoryBarrier();

        tsc_now = ReadTimeStampCounter();
        if (tsc_now - tsc_start >= TSC_SPIN_LIMIT) {
            TraceWarning(("Took a very long time to advance DPCs beyond wait for interrupts disabled: started at %I64x, still going at %I64x\n",
                          tsc_start, tsc_now));
            tsc_start = tsc_now;
        }
    }

    /* We need to have seen _g_ProcessorSpinCount2 go to 0 before it's
     * safe to reuse the stack location which holds processorCount. */
    XsMemoryBarrier();

    return oldIrql;
}

/* Reverse a previous QuiesceSystem(): let the captured processors go
 * and drop the quiesce lock.  OldIrql is the value QuiesceSystem()
 * returned.  This also acts as a very strong memory barrier, including
 * flushing processor pipelines etc. */
void
UnquiesceSystem(
    IN  KIRQL OldIrql
    )
{
    /* Make everything done inside the quiescent section visible before
       anybody else is allowed to run. */
    XsMemoryBarrier();
    FLUSH_PIPELINE();

    /* Going back to phase 0 releases the processors spinning in their
       DPCs. */
    _g_QuiescePhase = 0;

    /* Driver verifier insists that spin locks are only ever released
       from DISPATCH_LEVEL, so come down to it first. */
    KeLowerIrql(DISPATCH_LEVEL);
    KeReleaseSpinLock(&_g_QuiesceLock, OldIrql);
}

/* The spin limit is only meant for the routines above. */
#undef TSC_SPIN_LIMIT

