/****************************************************************************
 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
 * (C) 2002-2003 University of Cambridge
 * (C) 2004      - Mark Williamson - Intel Research Cambridge
 ****************************************************************************
 *
 *        File: common/schedule.c
 *      Author: Rolf Neugebauer & Keir Fraser
 *              Updated for generic API by Mark Williamson
 * 
 * Description: Generic CPU scheduling code
 *              implements support functionality for the Xen scheduler API.
 *
 */

#ifndef COMPAT
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
// #include <xen/multicall.h>
#include <public/sched.h>
#include <xsm/xsm.h>

/*
 * opt_sched: name of the scheduler to select. Defaults to "credit", or to
 * "host" when built with __KXEN__; bound to the "sched" string parameter.
 */
static char opt_sched[10] =
#ifndef __KXEN__
      "credit"
#else
      "host"
#endif
      ;
string_param("sched", opt_sched);

#ifndef __KXEN__
/* Tolerance applied by vcpu_periodic_timer_work() when testing a deadline. */
#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */

/* Various timer handlers. */
static void s_timer_fn(void *unused);
#endif
/* Per-vcpu timer callbacks; installed by sched_init_vcpu(). */
static void vcpu_periodic_timer_fn(void *data);
static void vcpu_singleshot_timer_fn(void *data);
static void poll_timer_fn(void *data);

/*
 * This is global for now so that private implementations can reach it.
 * Per-CPU scheduler state; this file uses its schedule_lock, curr and idle
 * members and, in non-__KXEN__ builds, its s_timer member.
 */
DEFINE_PER_CPU(struct schedule_data, schedule_data);

#ifndef __KXEN__
/* NULL-terminated list of built-in schedulers, searched by scheduler_init(). */
extern struct scheduler sched_sedf_def;
extern struct scheduler sched_credit_def;
static struct scheduler *schedulers[] = { 
    &sched_sedf_def,
    &sched_credit_def,
    NULL
};
#else
/* The only scheduler used in __KXEN__ builds; copied into ops at init. */
extern struct scheduler sched_host_def;
#endif

/* Working copy of the selected scheduler's operations; set by scheduler_init(). */
static struct scheduler ops;

/*
 * Invoke scheduler hook 'fn' with the given arguments if the active scheduler
 * provides it (non-NULL); otherwise evaluate to 0 cast to the hook's return
 * type.
 */
#define SCHED_OP(fn, ...)                                 \
         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
          : (typeof(ops.fn(__VA_ARGS__)))0 )

/*
 * Emit a TRC_SCHED_RUNSTATE_CHANGE trace record for @v moving to @new_state.
 * The old and new states (low two bits each) are folded into the event word;
 * the payload carries the vcpu and domain ids as two 16-bit fields.
 */
static inline void trace_runstate_change(struct vcpu *v, int new_state)
{
    struct { uint32_t vcpu:16, domain:16; } ids;
    uint32_t ev;

    /* Fast exit while tb_init_done is clear. */
    if ( likely(!tb_init_done) )
        return;

    ids.vcpu = v->vcpu_id;
    ids.domain = v->domain->domain_id;

    ev = TRC_SCHED_RUNSTATE_CHANGE
        | (( v->runstate.state & 0x3 ) << 8)
        | (( new_state & 0x3 ) << 4);

    __trace_var(ev, 1/*tsc*/, sizeof(ids), (unsigned char *)&ids);
}

/*
 * Used to quickly map the vcpu runstate mask to a domain runstate.
 * The index is the 3-bit mask built in vcpu_runstate_change(): per the
 * entries below, bit 0 = some vcpu running, bit 1 = some vcpu runnable,
 * bit 2 = some vcpu blocked or offline.
 */
static int mask_to_state[] = {
    /* 000: Nothing in any runstate.  Should never happen. */
    -1,
    /* 001: All running */
    DOMAIN_RUNSTATE_full_run,
    /* 010: All runnable */
    DOMAIN_RUNSTATE_full_contention,
    /* 011: Some running, some runnable */
    DOMAIN_RUNSTATE_concurrency_hazard,
    /* 100: All blocked / offline */
    DOMAIN_RUNSTATE_blocked,
    /* 101: Some running, some blocked / offline */
    DOMAIN_RUNSTATE_partial_run,
    /* 110: Some blocked / offline, some runnable */
    DOMAIN_RUNSTATE_partial_contention,
    /* 111: Some in every state.  Mixed running + runnable is most important. */
    DOMAIN_RUNSTATE_concurrency_hazard
};

/*
 * Move @v into @new_state at time @new_entry_time, charging the elapsed time
 * to the state being left, then try to refresh the owning domain's aggregate
 * runstate.
 *
 * The caller must hold the schedule lock of v->processor, and @new_state must
 * differ from the vcpu's current state (both conditions are ASSERTed).
 */
static inline void vcpu_runstate_change(
    struct vcpu *v, int new_state, s_time_t new_entry_time)
{
    struct domain *d = v->domain;
    ASSERT(v->runstate.state != new_state);
    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));

    trace_runstate_change(v, new_state);

    /* Account the time spent in the state we are leaving. */
    v->runstate.time[v->runstate.state] +=
        new_entry_time - v->runstate.state_entry_time;
    v->runstate.state_entry_time = new_entry_time;
    v->runstate.state = new_state;

    /*
     * Update domain runstate. Only a trylock is attempted: if the lock cannot
     * be taken the domain update is skipped and counted as a missed change.
     */
    if ( spin_trylock(&d->runstate_lock) )
    {
        unsigned mask=0;
        struct vcpu *ov;

        BUG_ON(d->runstate.state > DOMAIN_RUNSTATE_partial_contention);

        d->runstate.time[d->runstate.state] +=
            (new_entry_time - d->runstate.state_entry_time);
        d->runstate.state_entry_time = new_entry_time;

        /* Determine new runstate.  First, see what states we have */
        for_each_vcpu(d, ov) {
            /* Don't count vcpus that have been taken offline by the guest */ 
            if ( !(ov->runstate.state == RUNSTATE_offline
                   && test_bit(_VPF_down, &ov->pause_flags)) )
               mask |= (1 << ov->runstate.state);
        }

        BUG_ON(mask == 0);

        /* Offline & blocked are the same: fold the offline bit one place down. */
        mask |= ((1 << RUNSTATE_offline) & mask) >> 1;

        /* Only the low three bits index the lookup table. */
        d->runstate.state = mask_to_state[mask&0x7];

        spin_unlock(&d->runstate_lock);
    } 
    else 
    {
        atomic_inc(&d->runstate_missed_changes);
    }
}

/*
 * Copy @v's runstate into @runstate, with the time of the state currently in
 * progress brought up to NOW().
 */
void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
{
    if ( unlikely(v != current) )
    {
        /* Another vcpu: take the snapshot under its schedule lock. */
        vcpu_schedule_lock_irq(v);
        memcpy(runstate, &v->runstate, sizeof(*runstate));
        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
        vcpu_schedule_unlock_irq(v);
        return;
    }

    /* Querying ourselves: lock-free, and we are necessarily running. */
    memcpy(runstate, &v->runstate, sizeof(*runstate));
    ASSERT(runstate->state == RUNSTATE_running);
    runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
}

/*
 * Copy @d's aggregate runstate into @runstate under the domain's
 * runstate_lock, topping up the in-progress state's time to NOW() and
 * reporting the number of missed updates.
 */
void domain_runstate_get(struct domain *d,
                         domain_runstate_info_t *runstate)
{
    spin_lock(&d->runstate_lock);

    /* Raw snapshot first; the adjustments below only touch the copy. */
    memcpy(runstate, &d->runstate, sizeof(*runstate));

    runstate->missed_changes = atomic_read(&d->runstate_missed_changes);
    runstate->time[d->runstate.state] += NOW() - runstate->state_entry_time;

    spin_unlock(&d->runstate_lock);
}

/*
 * Set up scheduler-related state for a new vcpu placed on @processor:
 * affinity, the three per-vcpu timers, and (for idle vcpus) the per-CPU
 * curr/idle pointers. Returns the result of the scheduler's init_vcpu hook.
 */
int sched_init_vcpu(struct vcpu *v, unsigned int processor) 
{
    struct domain *dom = v->domain;

    v->processor = processor;

    /*
     * Idle vcpus and vcpus of pinned domains are tied to their physical CPU;
     * everything else may initially run anywhere.
     */
    if ( !is_idle_domain(dom) && !dom->is_pinned )
    {
        cpus_setall(v->cpu_affinity);
    }
    else
    {
        v->cpu_affinity = cpumask_of_cpu(processor);
    }

    /* Per-vcpu timers, all initially bound to the vcpu's processor. */
    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, v->processor);
    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v,
               v->processor);
    init_timer(&v->poll_timer, poll_timer_fn, v, v->processor);

    /* An idle vcpu becomes the CPU's current and idle vcpu straight away. */
    if ( is_idle_domain(dom) )
    {
        per_cpu(schedule_data, v->processor).curr = v;
        per_cpu(schedule_data, v->processor).idle = v;
        v->is_running = 1;
    }

    TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);

    return SCHED_OP(init_vcpu, v);
}

/*
 * Tear down scheduler-related state for @v: kill its three per-vcpu timers,
 * then invoke the scheduler's destroy_vcpu hook.
 */
void sched_destroy_vcpu(struct vcpu *v)
{
    kill_timer(&v->periodic_timer);
    kill_timer(&v->singleshot_timer);
    kill_timer(&v->poll_timer);
    SCHED_OP(destroy_vcpu, v);
}

/*
 * Per-domain scheduler initialisation. Returns the result of the scheduler's
 * init_domain hook; in __KXEN__ builds the hook is not called and 0 is
 * returned.
 */
int sched_init_domain(struct domain *d)
{
#ifndef __KXEN__
    return SCHED_OP(init_domain, d);
#else
    return 0;
#endif
}

/*
 * Per-domain scheduler teardown via the destroy_domain hook. In __KXEN__
 * builds calling this is treated as a bug.
 */
void sched_destroy_domain(struct domain *d)
{
#ifndef __KXEN__
    SCHED_OP(destroy_domain, d);
#else
    BUG();
#endif
}

/*
 * Put @v to sleep if it is no longer runnable, without waiting for it to
 * actually stop running. A vcpu that was merely runnable is accounted as
 * offline before the scheduler's sleep hook is invoked.
 */
void vcpu_sleep_nosync(struct vcpu *v)
{
    unsigned long irq_flags;

    vcpu_schedule_lock_irqsave(v, irq_flags);

    /* Still runnable: nothing to do beyond the trace record. */
    if ( unlikely(vcpu_runnable(v)) )
        goto unlock;

    if ( v->runstate.state == RUNSTATE_runnable )
        vcpu_runstate_change(v, RUNSTATE_offline, NOW());

    SCHED_OP(sleep, v);

 unlock:
    vcpu_schedule_unlock_irqrestore(v, irq_flags);

    TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
}

/*
 * Put @v to sleep and wait until it has stopped running (or has become
 * runnable again), then synchronise its execution state.
 */
void vcpu_sleep_sync(struct vcpu *v)
{
    vcpu_sleep_nosync(v);

    /* Busy-wait while the vcpu is both non-runnable and still marked running. */
    for ( ; ; )
    {
        if ( vcpu_runnable(v) || !v->is_running )
            break;
        cpu_relax();
    }

    sync_vcpu_execstate(v);
}

/*
 * Wake @v. A runnable vcpu that was blocked or offline is accounted as
 * runnable and handed to the scheduler's wake hook. A vcpu that is not
 * runnable but is no longer flagged as blocked has its accounting moved from
 * blocked to offline.
 */
void vcpu_wake(struct vcpu *v)
{
    unsigned long irq_flags;

    vcpu_schedule_lock_irqsave(v, irq_flags);

    if ( unlikely(!vcpu_runnable(v)) )
    {
        /* Unblocked but held back for another reason: blocked -> offline. */
        if ( !test_bit(_VPF_blocked, &v->pause_flags) &&
             (v->runstate.state == RUNSTATE_blocked) )
            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
    }
    else
    {
        if ( v->runstate.state >= RUNSTATE_blocked )
            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
        SCHED_OP(wake, v);
    }

    vcpu_schedule_unlock_irqrestore(v, irq_flags);

    TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
}

/*
 * Complete a pending migration of @v: if the vcpu has stopped running and
 * its _VPF_migrating flag is set, clear the flag, let the scheduler pick a
 * new CPU, and wake the vcpu there. Otherwise do nothing.
 *
 * In __KXEN__ builds nothing in this file ever sets _VPF_migrating, so
 * reaching this function is treated as a bug.
 */
static void vcpu_migrate(struct vcpu *v)
{
#ifndef __KXEN__
    unsigned long flags;
    int old_cpu;

    vcpu_schedule_lock_irqsave(v, flags);

    /*
     * NB. Check of v->is_running happens /after/ setting migration flag
     * because they both happen in (different) spinlock regions, and those
     * regions are strictly serialised.
     */
    if ( v->is_running ||
         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
    {
        vcpu_schedule_unlock_irqrestore(v, flags);
        return;
    }

    /*
     * Switch to new CPU, then unlock old CPU. The lock was taken via the old
     * v->processor, so it must be released through old_cpu explicitly.
     */
    old_cpu = v->processor;
    v->processor = SCHED_OP(pick_cpu, v);
    spin_unlock_irqrestore(
        &per_cpu(schedule_data, old_cpu).schedule_lock, flags);

    /* Wake on new CPU. */
    vcpu_wake(v);
#else
    /* Must not be reached: see the header comment. */
    BUG();
#endif
}

#ifndef __KXEN__
/*
 * Push a VCPU through a deschedule/reschedule cycle.
 * Callers use this, for instance, after changing the periodic timer period:
 * most periodic-timer state is then only ever touched from inside the
 * scheduler and needs no additional synchronisation.
 */
void vcpu_force_reschedule(struct vcpu *v)
{
    /* Request a migration only if the vcpu is running right now. */
    vcpu_schedule_lock_irq(v);
    if ( v->is_running )
        set_bit(_VPF_migrating, &v->pause_flags);
    vcpu_schedule_unlock_irq(v);

    if ( !test_bit(_VPF_migrating, &v->pause_flags) )
        return;

    vcpu_sleep_nosync(v);
    vcpu_migrate(v);
}

/*
 * Install *@affinity as @v's CPU affinity and hand the previous mask back
 * through @affinity.
 *
 * The vcpu's affinity_locked flag must currently equal @old_lock_status and
 * is changed to @new_lock_status. Returns -EINVAL if the requested mask has
 * no online CPU, -EBUSY on a lock-status mismatch, 0 on success. If the
 * vcpu's current processor is outside the new mask a migration is started.
 */
static int __vcpu_set_affinity(
    struct vcpu *v, cpumask_t *affinity,
    bool_t old_lock_status, bool_t new_lock_status)
{
    cpumask_t usable, previous;

    /* The new mask must contain at least one online CPU. */
    cpus_and(usable, *affinity, cpu_online_map);
    if ( cpus_empty(usable) )
        return -EINVAL;

    vcpu_schedule_lock_irq(v);

    if ( v->affinity_locked != old_lock_status )
    {
        /* The only tolerated mismatch is "locked when expected unlocked". */
        BUG_ON(!v->affinity_locked);
        vcpu_schedule_unlock_irq(v);
        return -EBUSY;
    }

    v->affinity_locked = new_lock_status;

    /* Exchange the caller's mask with the vcpu's current one. */
    previous = v->cpu_affinity;
    v->cpu_affinity = *affinity;
    *affinity = previous;

    if ( !cpu_isset(v->processor, v->cpu_affinity) )
        set_bit(_VPF_migrating, &v->pause_flags);

    vcpu_schedule_unlock_irq(v);

    if ( !test_bit(_VPF_migrating, &v->pause_flags) )
        return 0;

    vcpu_sleep_nosync(v);
    vcpu_migrate(v);

    return 0;
}

/*
 * Change @v's affinity without touching the affinity lock. Refused with
 * -EINVAL for vcpus of pinned domains; otherwise returns what
 * __vcpu_set_affinity() returns.
 */
int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
{
    return v->domain->is_pinned
        ? -EINVAL
        : __vcpu_set_affinity(v, affinity, 0, 0);
}

/*
 * Set @v's affinity to *@affinity and mark the affinity as locked. The
 * affinity must currently be unlocked. On success the previous mask is
 * returned through @affinity; the return value is that of
 * __vcpu_set_affinity().
 */
int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
{
    return __vcpu_set_affinity(v, affinity, 0, 1);
}

/*
 * Release a locked affinity on @v, installing *@affinity as the new mask.
 * If that mask has no online CPU, all online CPUs are used instead. Any
 * failure of the underlying update is fatal.
 */
void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
{
    cpumask_t usable;
    int rc;

    /* Do not fail if no CPU in old affinity mask is online. */
    cpus_and(usable, *affinity, cpu_online_map);
    if ( cpus_empty(usable) )
        *affinity = cpu_online_map;

    rc = __vcpu_set_affinity(v, affinity, 1, 0);
    if ( rc != 0 )
        BUG();
}
#endif

/*
 * Block the calling vcpu until a pertinent event occurs.
 * Event delivery is enabled and the blocked flag is set first; if an event
 * is already pending the flag is cleared again, otherwise a reschedule is
 * requested. Always returns 0.
 */
static long do_block(void)
{
    struct vcpu *cur_v = current;

    local_event_delivery_enable();
    set_bit(_VPF_blocked, &cur_v->pause_flags);

    /* Check for events /after/ blocking: avoids wakeup waiting race. */
    if ( !local_events_need_delivery() )
    {
        TRACE_2D(TRC_SCHED_BLOCK, cur_v->domain->domain_id, cur_v->vcpu_id);
        vcpu_raise_softirq(cur_v, SCHEDULE_SOFTIRQ);
        return 0;
    }

    /* Something is already pending: do not block after all. */
    clear_bit(_VPF_blocked, &cur_v->pause_flags);
    return 0;
}

#ifndef __KXEN__
/*
 * SCHEDOP_poll: block the calling vcpu until one of the event-channel ports
 * listed in @sched_poll is pending, or until the optional timeout fires.
 *
 * Returns -EINVAL for more than 128 ports or an out-of-range port, -EFAULT
 * if the port array cannot be accessed, and 0 otherwise (whether the vcpu
 * ends up blocking or an event was already pending).
 */
static long do_poll(struct sched_poll *sched_poll)
{
    struct vcpu   *v = current;
    struct domain *d = v->domain;
    evtchn_port_t  port;
    long           rc = 0;
    unsigned int   i;

    /* Fairly arbitrary limit. */
    if ( sched_poll->nr_ports > 128 )
        return -EINVAL;

    if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
        return -EFAULT;

    /* Mark ourselves blocked and polling before looking at pending events. */
    set_bit(_VPF_blocked, &v->pause_flags);
    v->is_polling = 1;
    d->is_polling = 1;

    /* Check for events /after/ setting flags: avoids wakeup waiting race. */
    smp_wmb();

    rc = 0;
    if ( local_events_need_delivery() )
        goto out;

    /* Each iteration presets rc to the error its following check reports. */
    for ( i = 0; i < sched_poll->nr_ports; i++ )
    {
        rc = -EFAULT;
        if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
            goto out;

        rc = -EINVAL;
        if ( port >= MAX_EVTCHNS(d) )
            goto out;

        rc = 0;
        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
            goto out;
    }

    /* A timeout of 0 means no timer is armed. */
    if ( sched_poll->timeout != 0 )
        set_timer(&v->poll_timer, sched_poll->timeout);

    TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
    vcpu_raise_softirq(v, SCHEDULE_SOFTIRQ);

    return 0;

 out:
    /* Not blocking after all: undo the per-vcpu flags (d->is_polling is left set). */
    v->is_polling = 0;
    clear_bit(_VPF_blocked, &v->pause_flags);
    return rc;
}
#endif

/* Voluntarily yield the processor for this allocation. */
static long do_yield(void)
{
#ifndef __KXEN__
    struct vcpu * v=current;

    vcpu_schedule_lock_irq(v);
    SCHED_OP(yield, current);
    vcpu_schedule_unlock_irq(v);

    TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
    vcpu_raise_softirq(current, SCHEDULE_SOFTIRQ);
    return 0;
#else
    BUG();
    return 0;
#endif
}

#ifndef __KXEN__
/* Watchdog timer callback: log the expiry and shut the owning domain down. */
static void watchdog_timeout(void *data)
{
    struct domain *dom = data;

    printk("Watchdog timer fired for domain %u\n", dom->domain_id);
    domain_shutdown(dom, SHUTDOWN_watchdog);
}

/*
 * Create, re-arm or stop one of @d's watchdog timers.
 *
 * @id == 0: start the first inactive timer with @timeout seconds and return
 *           its id (ids count from 1), or -EEXIST if every timer is active.
 * @id != 0: operate on timer @id; -EINVAL if out of range, -EEXIST if that
 *           timer is not active. A @timeout of 0 stops it, anything else
 *           re-arms it @timeout seconds from now. Returns 0.
 */
static long do_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
{
    unsigned int slot;

    if ( id != 0 )
    {
        if ( id > NR_WATCHDOG_TIMERS )
            return -EINVAL;

        /* Timer IDs count from 1, not 0 */
        slot = id - 1;

        if ( !active_timer(&d->watchdog_timer[slot]) )
            return -EEXIST;

        if ( timeout != 0 )
            set_timer(&d->watchdog_timer[slot], NOW() + SECONDS(timeout));
        else
            stop_timer(&d->watchdog_timer[slot]);

        return 0;
    }

    /* Find an unused watchdog timer and start it */
    for ( slot = 0; slot < NR_WATCHDOG_TIMERS; slot++ )
    {
        if ( active_timer(&d->watchdog_timer[slot]) )
            continue;

        init_timer(&d->watchdog_timer[slot], watchdog_timeout,
                   (void *)d, 0);
        set_timer(&d->watchdog_timer[slot], NOW() + SECONDS(timeout));
        return slot + 1;
    }

    return -EEXIST;
}

/* Kill every watchdog timer slot belonging to @d. */
void watchdog_domain_destroy(struct domain *d)
{
    unsigned int slot = 0;

    while ( slot < NR_WATCHDOG_TIMERS )
    {
        kill_timer(&d->watchdog_timer[slot]);
        slot++;
    }
}
#endif

/*
 * Scheduler hypercall taking its argument by value.
 * Handles SCHEDOP_yield, SCHEDOP_block and SCHEDOP_shutdown (with @arg as the
 * shutdown reason, truncated to 8 bits); any other command yields -ENOSYS.
 */
long do_sched_op_compat(int cmd, unsigned long arg)
{
    switch ( cmd )
    {
    case SCHEDOP_yield:
        return do_yield();

    case SCHEDOP_block:
        return do_block();

    case SCHEDOP_shutdown:
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id, arg);
        domain_shutdown(current->domain, (u8)arg);
        return 0;

    default:
        break;
    }

    return -ENOSYS;
}
#ifndef __KXEN__

/*
 * Return type of do_sched_op() for the native build.
 * NOTE(review): presumably overridden when this file is re-included with
 * COMPAT defined -- confirm against compat/schedule.c.
 */
typedef long ret_t;

#endif
#endif /* !COMPAT */

#ifndef __KXEN__
/*
 * Scheduler hypercall taking a guest handle argument.
 *
 * @cmd selects the operation; @arg points at the command-specific structure
 * where one is needed. Returns 0 or a command-specific value on success,
 * -EFAULT if the argument cannot be copied from the guest, -ENOSYS for an
 * unknown command, and other negative errno values as noted per case.
 */
ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE(void) arg)
{
    ret_t ret = 0;

    switch ( cmd )
    {
    case SCHEDOP_yield:
    {
        ret = do_yield();
        break;
    }

    case SCHEDOP_block:
    {
        ret = do_block();
        break;
    }

    /* Shut down the calling domain with the supplied reason. */
    case SCHEDOP_shutdown:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        domain_shutdown(current->domain, (u8)sched_shutdown.reason);

        break;
    }

    /*
     * Record a shutdown code for the calling domain without shutting it
     * down. Only the first code set is kept; later calls leave it unchanged.
     */
    case SCHEDOP_shutdown_code_compat:
    case SCHEDOP_shutdown_code:
    {
        struct sched_shutdown sched_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_shutdown, arg, 1) )
            break;

        ret = 0;
        TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
                 current->domain->domain_id, current->vcpu_id,
                 sched_shutdown.reason);
        spin_lock(&current->domain->shutdown_lock);
        if ( !current->domain->has_shutdown_code )
        {
            current->domain->shutdown_code = (u8)sched_shutdown.reason;
            current->domain->has_shutdown_code = 1;
        }
        spin_unlock(&current->domain->shutdown_lock);

        break;
    }

    case SCHEDOP_poll:
    {
        struct sched_poll sched_poll;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_poll, arg, 1) )
            break;

        ret = do_poll(&sched_poll);

        break;
    }

    /*
     * Shut down another domain. Fails with -ESRCH if the domain does not
     * exist, -EPERM if the caller is not privileged for it, or with the XSM
     * hook's error if that hook refuses.
     */
    case SCHEDOP_remote_shutdown:
    {
        struct domain *d;
        struct sched_remote_shutdown sched_remote_shutdown;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
            break;

        ret = -ESRCH;
        d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
        if ( d == NULL )
            break;

        /* Every exit below this point must drop the RCU domain reference. */
        if ( !IS_PRIV_FOR(current->domain, d) )
        {
            rcu_unlock_domain(d);
            return -EPERM;
        }

        ret = xsm_schedop_shutdown(current->domain, d);
        if ( ret )
        {
            rcu_unlock_domain(d);
            return ret;
        }

        domain_shutdown(d, (u8)sched_remote_shutdown.reason);

        rcu_unlock_domain(d);
        ret = 0;

        break;
    }

    /* Watchdog management; serialised by the domain's watchdog_lock. */
    case SCHEDOP_watchdog:
    {
        struct sched_watchdog sched_watchdog;

        ret = -EFAULT;
        if ( copy_from_guest(&sched_watchdog, arg, 1) )
            break;

        spin_lock(&current->domain->watchdog_lock);
        ret = do_watchdog(current->domain, sched_watchdog.id,
                          sched_watchdog.timeout);
        spin_unlock(&current->domain->watchdog_lock);

        break;
    }

    default:
        ret = -ENOSYS;
    }

    return ret;
}
#endif

#ifndef COMPAT
#ifndef __KXEN__

/*
 * Per-vcpu oneshot-timer hypercall.
 * @timeout is an absolute time; 0 cancels the timer. Always returns 0.
 */
long do_set_timer_op(s_time_t timeout)
{
    struct vcpu *v = current;
    s_time_t delta = timeout - NOW();

    if ( timeout == 0 )
    {
        stop_timer(&v->singleshot_timer);
        return 0;
    }

    if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
         unlikely((delta > 0) && ((uint32_t)(delta >> 50) != 0)) )
    {
        /*
         * Linux workaround. Wrapping in Linux's jiffy handling occasionally
         * produces timeouts far in the future, so we catch values that have
         * wrapped negative as well as positive ones more than roughly 13
         * days ahead (2^50ns). Strictly the right response is an immediate
         * interrupt, because Linux really does have work pending. Older
         * guests, however, also program a distant timeout when they have
         * *no* timers pending, and firing at once for those would waste a
         * lot of CPU. As a compromise the timer event is raised in 100ms.
         */
        gdprintk(XENLOG_INFO,
                 "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
                 v->vcpu_id, (uint64_t)timeout);
        set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
        return 0;
    }

    /* Rebind the timer to the CPU we are on before arming it. */
    if ( v->singleshot_timer.cpu != smp_processor_id() )
    {
        stop_timer(&v->singleshot_timer);
        v->singleshot_timer.cpu = smp_processor_id();
    }

    set_timer(&v->singleshot_timer, timeout);

    return 0;
}

/* sched_id - fetch ID of current scheduler (the sched_id field of ops). */
int sched_id(void)
{
    return ops.sched_id;
}

/*
 * Adjust or query the scheduling parameters of domain @d.
 * Returns -EINVAL if @op targets a different scheduler or carries a command
 * other than putinfo/getinfo; otherwise the scheduler adjust hook's result.
 */
long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
{
    struct vcpu *v;
    long ret;
    int own_domain;

    if ( op->sched_id != ops.sched_id )
        return -EINVAL;

    if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
         (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) )
        return -EINVAL;

    /*
     * Every vcpu except the caller is simply paused. When the caller belongs
     * to the domain being adjusted, its local schedule_lock is taken as well
     * to guard against concurrent updates.
     *
     * That lock is only taken once all the other vcpus are paused, because:
     * 1- interrupts should not be held off while pausing, since pausing a
     *    vcpu can trigger a tlb shootdown;
     * 2- pausing a vcpu briefly takes the schedule lock of the CPU it runs
     *    on, which may be the very CPU we are on.
     */
    for_each_vcpu ( d, v )
    {
        if ( v == current )
            continue;
        vcpu_pause(v);
    }

    own_domain = (d == current->domain);

    if ( own_domain )
        vcpu_schedule_lock_irq(current);

    ret = SCHED_OP(adjust, d, op);
    if ( ret == 0 )
        TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);

    if ( own_domain )
        vcpu_schedule_unlock_irq(current);

    for_each_vcpu ( d, v )
    {
        if ( v == current )
            continue;
        vcpu_unpause(v);
    }

    return ret;
}

/*
 * Service @v's periodic timer: deliver a timer event if the next deadline
 * has been reached (within TIME_SLOP) and re-arm the timer on this CPU.
 * A period of 0 disables the timer; a period above one hour is logged and
 * reset to 0.
 */
static void vcpu_periodic_timer_work(struct vcpu *v)
{
    s_time_t now = NOW();
    s_time_t deadline;

    ASSERT(!active_timer(&v->periodic_timer));

    if ( v->periodic_period == 0 )
        return;

    if ( v->periodic_period > SECONDS(3600) )
    {
        printk("CA-9299: dom%u vcpu%u periodic timer is %#"PRIx64"; killing\n",
               v->domain->domain_id, v->vcpu_id, (uint64_t)v->periodic_period);
        v->periodic_period = 0;
        return;
    }

    deadline = v->periodic_last_event + v->periodic_period;

    /* The timer subsystem may call us up to TIME_SLOP ahead of deadline. */
    if ( deadline < (now + TIME_SLOP) )
    {
        send_timer_event(v);
        v->periodic_last_event = now;
        deadline = now + v->periodic_period;
    }

    /* Re-arm on whichever CPU we are executing on. */
    v->periodic_timer.cpu = smp_processor_id();
    set_timer(&v->periodic_timer, deadline);
}
#endif

/*
 * Deschedule @prev if it is no longer runnable: under this CPU's schedule
 * lock, move its runstate to blocked/runnable/offline as appropriate and
 * clear its is_running flag. Does nothing if @prev is still runnable.
 */
static
void schedule_from_vcpu(struct vcpu *prev)
{
    s_time_t              now = NOW();
    struct schedule_data *sd;
    int                   new_state;

    perfc_incr(sched_run);

    if ( vcpu_runnable(prev) )
        return;

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    if ( prev->is_running )
    {
        /* Runnability is re-evaluated here, now that the lock is held. */
        if ( test_bit(_VPF_blocked, &prev->pause_flags) )
            new_state = RUNSTATE_blocked;
        else if ( vcpu_runnable(prev) )
            new_state = RUNSTATE_runnable;
        else
            new_state = RUNSTATE_offline;

        vcpu_runstate_change(prev, new_state, now);

        prev->is_running = 0;
    }

    spin_unlock_irq(&sd->schedule_lock);
}

/* 
 * The main function
 * - deschedule the current domain (scheduler independent).
 * - pick a new domain (scheduler dependent).
 *
 * Installed as the SCHEDULE_SOFTIRQ handler by scheduler_init(). In
 * __KXEN__ builds it only calls schedule_from_vcpu() on the current vcpu.
 */
static void schedule(void)
{
#ifndef __KXEN__
    struct vcpu          *prev = current, *next = NULL;
    s_time_t              now = NOW();
    struct schedule_data *sd;
    struct task_slice     next_slice;
    s32                   r_time;     /* time for new dom to run */
    int                   dependent;

    ASSERT(!in_irq());
    ASSERT(this_cpu(mc_state).flags == 0);

    perfc_incr(sched_run);

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    stop_timer(&sd->s_timer);
    
    /* get policy-specific decision on scheduling... */
    next_slice = ops.do_schedule(now);

    r_time = next_slice.time;
    next = next_slice.task;

    sd->curr = next;
    
    /* Re-arm the scheduler timer for the end of the new time slice. */
    set_timer(&sd->s_timer, now + r_time);

    /* Same vcpu chosen again: no runstate change or context switch needed. */
    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&sd->schedule_lock);
        return continue_running(prev);
    }

    TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
             prev->domain->domain_id,
             now - prev->runstate.state_entry_time);
    TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
             next->domain->domain_id,
             (next->runstate.state == RUNSTATE_runnable) ?
             (now - next->runstate.state_entry_time) : 0,
             r_time);

    /* Outgoing vcpu: blocked, runnable or offline depending on its flags. */
    ASSERT(prev->runstate.state == RUNSTATE_running);
    vcpu_runstate_change(
        prev,
        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
        now);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incr(sched_ctx);

    stop_timer(&prev->periodic_timer);

    /* Drain next->paused_dependent, unpausing the vcpu at each set index. */
    while ( unlikely(!cpus_empty(next->paused_dependent)) )
    {
        /* Wake vcpus blocked waiting for this vcpu, schedule lock NOT held. */
        dependent = first_cpu(next->paused_dependent);
        if ( (dependent != NR_CPUS) &&
              cpu_test_and_clear(dependent, next->paused_dependent) )
        {
            struct vcpu * v = next->domain->vcpu[dependent];
            vcpu_unpause(v);
            /* statistics */
            next->deferred_wake++;
            atomic_inc((atomic_t *)&v->pause_self_woken);
        }
    }
    next->dependent_pause_deferred.counter = 0;

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    vcpu_periodic_timer_work(next);

    TRACE_4D(TRC_SCHED_SWITCH,
             prev->domain->domain_id, prev->vcpu_id,
             next->domain->domain_id, next->vcpu_id);

    context_switch(prev, next);
#else
    struct vcpu          *prev = current;

    schedule_from_vcpu(prev);
#endif
}

/*
 * Mark @next as running on this CPU: under the local schedule lock its
 * runstate is switched to RUNSTATE_running and is_running is set; afterwards
 * vcpus recorded in next->paused_dependent are unpaused and the vcpu's system
 * time is refreshed.
 *
 * Unlike schedule(), no previous vcpu is descheduled here, no context switch
 * is performed, and the periodic timer is not serviced (that call is
 * commented out below).
 */
void
schedule_vcpu(struct vcpu *next)
{
    s_time_t              now = NOW();
    struct schedule_data *sd;
    int                   dependent;

    sd = &this_cpu(schedule_data);

    spin_lock_irq(&sd->schedule_lock);

    ASSERT(next->runstate.state != RUNSTATE_running);
    vcpu_runstate_change(next, RUNSTATE_running, now);

    ASSERT(!next->is_running);
    next->is_running = 1;

    spin_unlock_irq(&sd->schedule_lock);

    perfc_incr(sched_ctx);

    /* Drain next->paused_dependent, unpausing the vcpu at each set index. */
    while ( unlikely(!cpus_empty(next->paused_dependent)) )
    {
        /* Wake vcpus blocked waiting for this vcpu, schedule lock NOT held. */
        dependent = first_cpu(next->paused_dependent);
        if ( (dependent != NR_CPUS) &&
              cpu_test_and_clear(dependent, next->paused_dependent) )
        {
            struct vcpu * v = next->domain->vcpu[dependent];
            vcpu_unpause(v);
            /* statistics */
            next->deferred_wake++;
            atomic_inc((atomic_t *)&v->pause_self_woken);
        }
    }
    next->dependent_pause_deferred.counter = 0;

    /* Ensure that the domain has an up-to-date time base. */
    update_vcpu_system_time(next);
    // vcpu_periodic_timer_work(next);
}

/*
 * Called once @prev's context has been written out: clears its is_running
 * flag and, if a migration was requested for it in the meantime, carries the
 * migration out. The barriers order the context writes before the flag
 * clear, and the flag clear before the migration-flag check.
 */
void context_saved(struct vcpu *prev)
{
    /* Clear running flag /after/ writing context to memory. */
    smp_wmb();

    prev->is_running = 0;

    /* Check for migration request /after/ clearing running flag. */
    smp_mb();

    if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) )
        vcpu_migrate(prev);
}

#ifndef __KXEN__
/*
 * The scheduler timer: force a run through the scheduler by raising
 * SCHEDULE_SOFTIRQ. The timer argument is unused.
 */
static void s_timer_fn(void *unused)
{
    raise_softirq(SCHEDULE_SOFTIRQ);
    perfc_incr(sched_irq);
}
#endif

/*
 * Per-VCPU periodic timer function: sends a virtual timer interrupt.
 * @data is the vcpu the timer belongs to; the work is delegated to
 * vcpu_periodic_timer_work(). Must never fire in __KXEN__ builds.
 */
static void vcpu_periodic_timer_fn(void *data)
{
#ifndef __KXEN__
    struct vcpu *v = data;
    vcpu_periodic_timer_work(v);
#else
    BUG();
#endif
}

/*
 * Per-VCPU single-shot timer function: sends a virtual timer interrupt.
 * @data is the vcpu the timer belongs to. Must never fire in __KXEN__ builds.
 */
static void vcpu_singleshot_timer_fn(void *data)
{
#ifndef __KXEN__
    struct vcpu *v = data;
    send_timer_event(v);
#else
    BUG();
#endif
}

/*
 * SCHEDOP_poll timeout callback.
 * If the vcpu passed in @data is still polling, clear its polling flag and
 * unblock it; otherwise do nothing. Must never fire in __KXEN__ builds.
 */
static void poll_timer_fn(void *data)
{
#ifndef __KXEN__
    struct vcpu *v = data;

    if ( v->is_polling )
    {
        v->is_polling = 0;
        vcpu_unblock(v);
    }
#else
    BUG();
#endif
}

/*
 * Initialise the data structures.
 * Registers the SCHEDULE_SOFTIRQ handlers, initialises every CPU's schedule
 * lock (and, outside __KXEN__, its scheduler timer), selects the scheduler
 * named by opt_sched and runs its init hook.
 */
void __init scheduler_init(void)
{
    int i;

    open_softirq(SCHEDULE_SOFTIRQ, schedule);
    open_softirq_vcpu(SCHEDULE_SOFTIRQ, schedule_from_vcpu);

    for_each_cpu ( i )
    {
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
#ifndef __KXEN__
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
#endif
    }

#ifndef __KXEN__
    /*
     * Copy each candidate into ops before comparing names, so that when no
     * name matches, ops is left holding the last scheduler in the list.
     */
    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }
    
    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);
#else
    ops = sched_host_def;
#endif

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

#ifndef __KXEN__
/*
 * Print the active scheduler's name, settings and the current time, followed
 * by the scheduler's per-CPU state for every online CPU. Interrupts are
 * disabled throughout, and each CPU's schedule lock is held while that CPU's
 * state is dumped. @key is not used.
 */
void dump_runq(unsigned char key)
{
    s_time_t      now = NOW();
    int           cpu;
    unsigned long irq_flags;

    local_irq_save(irq_flags);

    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(dump_settings);
    /* The 64-bit time is printed as two 32-bit halves, high half first. */
    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);

    for_each_online_cpu ( cpu )
    {
        spin_lock(&per_cpu(schedule_data, cpu).schedule_lock);
        printk("CPU[%02d] ", cpu);
        SCHED_OP(dump_cpu_state, cpu);
        spin_unlock(&per_cpu(schedule_data, cpu).schedule_lock);
    }

    local_irq_restore(irq_flags);
}

#ifdef CONFIG_COMPAT
#include "compat/schedule.c"
#endif

#endif
#endif /* !COMPAT */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */
