# scheduler.py
# Copyright 2014 Budapest University of Technology and Economics (BME IK)
#
# This file is part of CIRCLE Cloud.
#
# CIRCLE is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# CIRCLE is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with CIRCLE.  If not, see <http://www.gnu.org/licenses/>.

18
import datetime
19 20 21
import json
import random
from logging import getLogger
22

23
from django.conf import settings
24
from django.core.cache import cache
25
from django.utils import timezone
26
from django.utils.translation import ugettext_noop
27

28
from circle.settings.base import SCHEDULER_METHOD
29
from common.models import HumanReadableException
30

31 32
logger = getLogger(__name__)

33

34 35
class SchedulerError(HumanReadableException):
    """Base class of the errors raised by the scheduler.

    Subclasses are expected to define ``message``; they may also define
    ``admin_message``, otherwise ``message`` is used in its place.
    """
    admin_message = None

    def __init__(self, params=None, level=None, **kwargs):
        """Merge ``params`` into ``kwargs`` and initialise the parent.

        The parent constructor receives, in order: ``level``, ``message``,
        the admin message (or ``message`` when no admin message is set) and
        the merged keyword dictionary.
        """
        # Entries of ``params`` take precedence over same-named kwargs.
        kwargs.update(params or {})
        super(SchedulerError, self).__init__(
            level, self.message, self.admin_message or self.message,
            kwargs)
42 43


44 45 46 47
class NotEnoughMemoryException(SchedulerError):
    """Scheduler error for the case when no node has enough memory."""

    # Text used as the general message of the error.
    message = ugettext_noop(
        "The resources required for launching the virtual machine are not "
        "available currently. Please try again later.")

    # More specific text used as the admin message of the error.
    admin_message = ugettext_noop(
        "The required free memory for launching the virtual machine is not "
        "available on any usable node currently. Please try again later.")
52

53

54 55 56
class TraitsUnsatisfiableException(SchedulerError):
    """Scheduler error for the case when no node has the required traits."""

    # No admin_message is defined here, so the base class falls back to
    # this text for the admin message as well.
    message = ugettext_noop(
        "No node can satisfy the required traits of the "
        "new virtual machine currently.")
58 59


60
def common_select(instance, nodes):
    """Filter and order the nodes that are able to host the instance.

    A node is kept only if it is enabled for scheduling, online, has every
    trait required by the instance and has enough RAM for it.

    instance -- the instance to place; its ``req_traits`` and ``ram_size``
                attributes are read
    nodes    -- iterable of candidate nodes

    Returns a non-empty list of usable nodes, highest ``priority`` first.

    Raises TraitsUnsatisfiableException if no enabled, online node has the
    required traits, and NotEnoughMemoryException if none of the remaining
    nodes has enough RAM.
    """
    # The required traits do not depend on the candidate node, so they are
    # fetched once instead of once per node.
    required_traits = list(instance.req_traits.all())

    # check required traits
    nodes = [n for n in nodes
             if n.schedule_enabled and n.online and
             has_traits(required_traits, n)]
    if not nodes:
        logger.warning('select_node: no usable node for %s', unicode(instance))
        raise TraitsUnsatisfiableException()

    # check required RAM
    nodes = [n for n in nodes if has_enough_ram(instance.ram_size, n)]
    if not nodes:
        logger.warning('select_node: no enough RAM for %s', unicode(instance))
        raise NotEnoughMemoryException()

    # sort nodes first by priority
    nodes.sort(key=lambda n: n.priority, reverse=True)
    return nodes


def common_evenly(instance, nodes):
    """Select the usable node with the highest free CPU time indicator.

    Candidates are produced by ``common_select``; the exceptions it raises
    when no node is usable are not caught here.
    """
    nodes = common_select(instance, nodes)
    # list.sort is stable (also with reverse=True), so the priority order
    # established by common_select decides between nodes with equal keys.
    nodes.sort(key=free_cpu_time, reverse=True)
    return nodes[0]
85

86 87 88 89 90 91

def common_random(instance, nodes):
    """Select one of the usable nodes at random.

    Candidates are produced by ``common_select``; the exceptions it raises
    when no node is usable are not caught here.
    """
    return random.choice(common_select(instance, nodes))

92

93 94
def advanced_with_time_stamp(instance, nodes):
    """Select the usable node with the highest ``sorting_key`` value.

    Candidates are produced by ``common_select``; the exceptions it raises
    when no node is usable are not caught here. The sorted candidate list
    is written to the log as a JSON "after_sort" event.
    """
    nodes = common_select(instance, nodes)
    nodes.sort(key=sorting_key, reverse=True)
    logger.info("SCHEDLOG: {}".format(json.dumps({
        "event": "after_sort",
        "list": [unicode(node) for node in nodes]})))
    return nodes[0]
101

102

103 104 105 106 107 108 109
def select_node(instance, nodes):
    """Select a node for hosting an instance based on its requirements.

    The strategy is chosen by SCHEDULER_METHOD: 'evenly' and 'advanced'
    have their own selectors; 'random' and any other value use random
    selection. The choice is logged as a JSON "select" event and the chosen
    node's time stamp is updated before it is returned.
    """
    if SCHEDULER_METHOD == 'evenly':
        choose = common_evenly
    elif SCHEDULER_METHOD == 'advanced':
        choose = advanced_with_time_stamp
    else:  # 'random' is also the default method
        choose = common_random
    result = choose(instance, nodes)

    logger.info("SCHEDLOG: {}".format(json.dumps(
        {"event": "select",
         "node": unicode(result),
         "vm": unicode(instance)})))

    set_time_stamp(result)

    return result
123 124


125 126 127
def sorting_key(node):
    """Determines how valuable a node is for scheduling.

    The key is the smaller of the node's free CPU time and free RAM
    indicators, multiplied by its last-scheduled correction factor. The
    inputs and the resulting key are logged as a JSON "sort" event.
    """
    # Each indicator is evaluated exactly once. The correction factor
    # depends on the current time, so calling it again for the log could
    # report a different value than the one the key was computed from.
    corr = last_scheduled_correction_factor(node)
    cpu = free_cpu_time(node)
    ram = free_ram(node)

    # The scarcer resource limits the node's value.
    key = (cpu if cpu < ram else ram) * corr

    logger.info("SCHEDLOG: {}".format(json.dumps({
        "event": "sort",
        "node": unicode(node),
        "sorting_key": unicode(key),
        "free_cpu_time": unicode(cpu),
        "free_ram": unicode(ram),
        "last_scheduled_correction_factor": unicode(corr)})))
    return key
142

143

144 145 146
def set_time_stamp(node):
    """Store the current time in the cache as the time stamp of the node.

    The cache key is 'time_stamp' followed by the node's id.
    """
    cache_key = 'time_stamp{}'.format(node.id)
    cache.set(cache_key, timezone.now())

147

148 149 150 151
def get_time_stamp(node):
    """Return the cached time stamp of the node.

    When the cache holds no (truthy) value under the node's key, midnight
    of 1 January 1970 in the current time zone is returned instead.
    """
    time_stamp = cache.get('time_stamp{}'.format(node.id))
    if time_stamp:
        return time_stamp
    return datetime.datetime(1970, 1, 1, tzinfo=timezone.get_current_timezone())

154

155 156 157 158 159 160 161 162 163 164
def last_scheduled_correction_factor(node):
    """Returns the time correction factor for a node.

    The monitor data may be outdated, because of recent scheduling for a
    given node. The return value is between 0 and 1, higher value indicates
    more time since the last scheduling for the given node.

    The factor is the time elapsed since the node's time stamp divided by
    settings.SCHEDULER_TIME_SENSITIVITY_IN_SECONDS.
    """
    max_time_diff = settings.SCHEDULER_TIME_SENSITIVITY_IN_SECONDS
    current_time = timezone.now()
    time_difference_in_seconds = (
        current_time - get_time_stamp(node)).total_seconds()
    factor = time_difference_in_seconds / float(max_time_diff)
    # Values outside [0, 1] are replaced by 1.
    # NOTE(review): a negative factor (time stamp later than "now") is
    # mapped to 1, not 0 -- confirm that this is the intended behaviour.
    if factor > 1 or factor < 0:
        factor = 1
    logger.info('Scheduler set factor to %s', unicode(factor))
    return factor
174

175

176 177 178 179 180 181 182 183 184 185 186
def has_traits(traits, node):
    """Tell whether the node provides every one of the given traits.

    traits -- iterable of required traits
    node   -- object whose ``traits.all()`` yields the traits it has

    Returns True if all required traits are among the node's traits
    (trivially so when none are required); otherwise False.
    """
    required = set(traits)
    available = node.traits.all()
    return required.issubset(available)


def has_enough_ram(ram_size, node):
    """True, if the node has enough memory to accommodate a guest requiring
       ram_size mebibytes of memory; otherwise, false.

    Two conditions must both hold, each as a strict inequality: the request
    is smaller than the node's currently unused RAM (``ram_size`` minus
    ``byte_ram_usage``), and smaller than the overcommitted capacity that is
    not yet allocated (``ram_size_with_overcommit`` minus ``allocated_ram``).

    A TypeError raised while evaluating the node's data is logged and
    treated as "not enough RAM".
    """
    # Convert the request from mebibytes to bytes.
    ram_size = ram_size * 1024 * 1024
    try:
        total = node.ram_size
        used = node.byte_ram_usage
        unused = total - used

        overcommit = node.ram_size_with_overcommit
        reserved = node.allocated_ram
        free = overcommit - reserved

        retval = ram_size < unused and ram_size < free

        logger.debug('has_enough_ram(%d, %s)=%s (total=%s unused=%s'
                     ' overcommit=%s free=%s free_ok=%s overcommit_ok=%s)',
                     ram_size, node, retval, total, unused, overcommit, free,
                     ram_size < unused, ram_size < free)
        return retval
    except TypeError as e:
        logger.exception('Got incorrect monitoring data for node %s. %s',
                         unicode(node), unicode(e))
        return False
208 209


210 211
def free_cpu_time(node):
    """Get an indicator number for idle processor time on the node.

    Higher values indicate more idle time.

    The indicator is ``(1 - node.cpu_usage) * node.cpu_weight``. A TypeError
    raised while computing it is logged and 0 is returned instead.
    """
    try:
        free_cpu_percent = 1 - node.cpu_usage
        weight = node.cpu_weight
        weighted_value = free_cpu_percent * weight
        return weighted_value
    except TypeError as e:
        logger.exception('Got incorrect monitoring data for node %s. %s',
                         unicode(node), unicode(e))
        return 0  # will result lowest priority
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238


def free_ram(node):
    """Get an indicator number for free RAM on the node.

    Higher value indicates more RAM.

    The indicator is ``(1 - node.ram_usage) * node.ram_weight``. A TypeError
    raised while computing it is logged and 0 is returned instead.
    """
    try:
        free_ram_percent = 1 - node.ram_usage
        weight = node.ram_weight
        weighted_value = free_ram_percent * weight
        return weighted_value
    except TypeError as e:
        logger.exception('Got incorrect monitoring data for node %s. %s',
                         unicode(node), unicode(e))
        return 0  # will result lowest priority