Commit c3b43f25 by Máhonfai Bálint

Merge branch 'scheduler' into 'master'

Scheduler

See merge request !411
parents 7a49024a 321a6bca
Pipeline #959 passed with stage
in 0 seconds
......@@ -576,7 +576,7 @@ SESSION_COOKIE_NAME = "csessid%x" % (((getnode() // 139) ^
MAX_NODE_RAM = get_env_variable("MAX_NODE_RAM", 1024)
MAX_NODE_CPU_CORE = get_env_variable("MAX_NODE_CPU_CORE", 10)
SCHEDULER_METHOD = get_env_variable("SCHEDULER_METHOD", 'random')
SCHEDULER_METHOD = get_env_variable("SCHEDULER_METHOD", 'advanced')
# Url to download the client: (e.g. http://circlecloud.org/client/download/)
CLIENT_DOWNLOAD_URL = get_env_variable('CLIENT_DOWNLOAD_URL', 'http://circlecloud.org/client/download/')
......@@ -590,3 +590,12 @@ REQUEST_HOOK_URL = get_env_variable("REQUEST_HOOK_URL", "")
SSHKEY_EMAIL_ADD_KEY = False
TWO_FACTOR_ISSUER = get_env_variable("TWO_FACTOR_ISSUER", "CIRCLE")
# Default value is every day at midnight
AUTO_MIGRATION_CRONTAB = get_env_variable("AUTO_MIGRATION_CRONTAB", "0 0 * * *")
AUTO_MIGRATION_TIME_LIMIT_IN_HOURS = (
get_env_variable("AUTO_MIGRATION_TIME_LIMIT_IN_HOURS", "2"))
# Maximum time difference until the monitor's values get valid
SCHEDULER_TIME_SENSITIVITY_IN_SECONDS = (
get_env_variable("SCHEDULER_TIME_SENSITIVITY_IN_SECONDS", "60"))
......@@ -65,7 +65,10 @@
"modified": "2014-02-19T21:11:34.671Z",
"priority": 1,
"traits": [],
"host": 1
"host": 1,
"ram_weight": 1.0,
"cpu_weight": 1.0,
"time_stamp": "2017-12-13T21:08:08.819Z"
}
}
]
......@@ -1697,3 +1697,11 @@ class TwoFactorConfirmationForm(forms.Form):
totp = pyotp.TOTP(self.user.profile.two_factor_secret)
if not totp.verify(self.cleaned_data.get('confirmation_code')):
raise ValidationError(_("Invalid confirmation code."))
class AutoMigrationForm(forms.Form):
minute = forms.CharField()
hour = forms.CharField()
day_of_month = forms.CharField()
month_of_year = forms.CharField()
day_of_week = forms.CharField()
......@@ -1079,6 +1079,10 @@ textarea[name="new_members"] {
max-width: 100%;
}
#node-list-auto-migration-body {
padding: 20px;
}
#vm-list-table td.state,
#vm-list-table td.memory {
white-space: nowrap;
......
......@@ -3,4 +3,11 @@ $(function() {
// find disabled nodes, set danger (red) on the rows
$('.node-disabled').closest("tr").addClass('danger');
});
$('#reschedule-now').click(function() {
$.get($(this).attr('href'), function(data){
highlight = data.result === 'ok' ? 'success' : 'danger';
addMessage(data.message, highlight);
});
return false;
});
});
......@@ -41,4 +41,23 @@
</div><!-- -col-md-12 -->
</div><!-- .row -->
<div class="row">
<div class="col-md-12">
<div class="panel panel-default">
<div class="panel-heading">
<a id="reschedule-now" class="btn btn-danger pull-right" href="{% url "dashboard.views.reschedule" %}">
<i class="fa fa-magic"></i> {% trans "Reschedule now" %}
</a>
<h3 class="no-margin"><i class="fa fa-truck"></i> {% trans "Virtual machine auto migration" %}</h3>
</div>
<div id="node-list-auto-migration-body">
<h1>Crontab</h1>
<form>
{{ auto_migration_form.as_p }}
</form>
</div>
</div>
</div><!-- -col-md-12 -->
</div><!-- .row -->
{% endblock %}
......@@ -56,6 +56,7 @@ from .views import (
MessageList, MessageDetail, MessageCreate, MessageDelete,
EnableTwoFactorView, DisableTwoFactorView,
AclUserGroupAutocomplete, AclUserAutocomplete,
RescheduleView,
)
from .views.vm import vm_ops, vm_mass_ops
from .views.node import node_ops
......@@ -153,6 +154,8 @@ urlpatterns = [
r'(?P<time>[0-9]{1,2}[hdwy])$'),
NodeListGraphView.as_view(),
name='dashboard.views.node-list-graph'),
url(r'^node/reschedule/$', RescheduleView.as_view(),
name="dashboard.views.reschedule"),
url((r'^template/(?P<pk>\d+)/graph/(?P<metric>[a-z]+)/'
r'(?P<time>[0-9]{1,2}[hdwy])$'),
TemplateGraphView.as_view(),
......
......@@ -25,7 +25,7 @@ from django.core.exceptions import PermissionDenied
from django.core.urlresolvers import reverse_lazy
from django.db.models import Count
from django.forms.models import inlineformset_factory
from django.http import HttpResponse
from django.http import HttpResponse, JsonResponse
from django.shortcuts import redirect
from django.template.loader import render_to_string
from django.utils.translation import ugettext as _
......@@ -37,11 +37,14 @@ from django_tables2 import SingleTableView
from firewall.models import Host
from vm.models import Node, NodeActivity, Trait
from vm.tasks.vm_tasks import check_queue
from vm.tasks.local_periodic_tasks import auto_migrate
from ..forms import TraitForm, HostForm, NodeForm
from ..forms import TraitForm, HostForm, NodeForm, AutoMigrationForm
from ..tables import NodeListTable
from .util import AjaxOperationMixin, OperationView, GraphMixin, DeleteViewBase
from manager.mancelery import crontab_parser
def get_operations(instance, user):
ops = []
......@@ -190,6 +193,14 @@ class NodeList(LoginRequiredMixin, GraphMixin, SingleTableView):
table_class = NodeListTable
table_pagination = False
def get_crontab(self):
return crontab_parser(settings.AUTO_MIGRATION_CRONTAB)
def get_context_data(self):
context = super(NodeList, self).get_context_data()
context["auto_migration_form"] = AutoMigrationForm(self.get_crontab())
return context
def get(self, *args, **kwargs):
if not self.request.user.has_perm('vm.view_statistics'):
raise PermissionDenied()
......@@ -367,3 +378,23 @@ class NodeActivityDetail(LoginRequiredMixin, SuperuserRequiredMixin,
).order_by('-started').select_related())
ctx['icon'] = _get_activity_icon(self.object)
return ctx
class RescheduleView(SuperuserRequiredMixin, View):
def get(self, *args, **kwargs):
try:
auto_migrate.apply_async(queue='localhost.man.slow')
except Exception as e:
msg = str(e)
result = 'error'
else:
result = 'ok'
msg = _('Reschedule has started.')
if self.request.is_ajax():
return JsonResponse({'result': result, 'message': msg})
else:
if result == 'ok':
messages.success(self.request, msg)
else:
messages.error(self.request, msg)
return redirect('dashboard.views.node-list')
......@@ -17,13 +17,27 @@
from celery import Celery
from celery.signals import worker_ready
from celery.schedules import crontab
from datetime import timedelta
from celery.schedules import crontab
from kombu import Queue, Exchange
from os import getenv
HOSTNAME = "localhost"
QUEUE_NAME = HOSTNAME + '.man'
AUTO_MIGRATION_CRONTAB = getenv('AUTO_MIGRATION_CRONTAB', '0 0 * * *')
def crontab_parser(crontab):
fields = crontab.split(' ')
return dict(
minute=fields[0],
hour=fields[1],
day_of_month=fields[2],
month_of_year=fields[3],
day_of_week=fields[4],
)
celery = Celery('manager',
......@@ -56,6 +70,11 @@ celery.conf.update(
'schedule': crontab(minute=10, hour=1),
'options': {'queue': 'localhost.man'}
},
'vm.local_periodic_tasks': {
'task': 'vm.tasks.local_periodic_tasks.auto_migrate',
'schedule': crontab(**crontab_parser(AUTO_MIGRATION_CRONTAB)),
'options': {'queue': 'localhost.man.slow'},
},
}
)
......
......@@ -15,15 +15,18 @@
# You should have received a copy of the GNU General Public License along
# with CIRCLE. If not, see <http://www.gnu.org/licenses/>.
import datetime
import json
import random
from logging import getLogger
from django.conf import settings
from django.core.cache import cache
from django.utils import timezone
from django.utils.translation import ugettext_noop
from common.models import HumanReadableException
from circle.settings.base import SCHEDULER_METHOD
import random
from common.models import HumanReadableException
logger = getLogger(__name__)
......@@ -69,14 +72,14 @@ def common_select(instance, nodes):
logger.warning('select_node: no enough RAM for %s', unicode(instance))
raise NotEnoughMemoryException()
# sort nodes first by processor usage, then priority
# sort nodes first by priority
nodes.sort(key=lambda n: n.priority, reverse=True)
nodes.sort(key=free_cpu_time, reverse=True)
return nodes
def common_evenly(instance, nodes):
nodes = common_select(instance, nodes)
nodes.sort(key=free_cpu_time, reverse=True)
result = nodes[0]
return result
......@@ -87,6 +90,16 @@ def common_random(instance, nodes):
return result
def advanced_with_time_stamp(instance, nodes):
nodes = common_select(instance, nodes)
nodes.sort(key=sorting_key, reverse=True)
logger.info("SCHEDLOG: {}".format(json.dumps({
"event": "after_sort",
"list": map(lambda node: unicode(node), nodes)})))
result = nodes[0]
return result
def select_node(instance, nodes):
''' Select a node for hosting an instance based on its requirements.
'''
......@@ -94,14 +107,72 @@ def select_node(instance, nodes):
result = common_evenly(instance, nodes)
elif SCHEDULER_METHOD == 'random':
result = common_random(instance, nodes)
elif SCHEDULER_METHOD == 'advanced':
result = advanced_with_time_stamp(instance, nodes)
else: # Default method is the random
result = common_random(instance, nodes)
logger.info('Scheduler method: %s selected', unicode(SCHEDULER_METHOD))
logger.info('select_node: %s for %s', unicode(result), unicode(instance))
logger.info("SCHEDLOG: {}".format(json.dumps(
{"event": "select",
"node": unicode(result),
"vm": unicode(instance)})))
set_time_stamp(result)
return result
def sorting_key(node):
"""Determines how valuable a node is for scheduling.
"""
key = 0
corr = last_scheduled_correction_factor(node)
if free_cpu_time(node) < free_ram(node):
key = free_cpu_time(node) * corr
else:
key = free_ram(node) * corr
logger.info("SCHEDLOG: {}".format(json.dumps({
"event": "sort",
"node": unicode(node),
"sorting_key": unicode(key),
"free_cpu_time": unicode(free_cpu_time(node)),
"free_ram": unicode(free_ram(node)),
"last_scheduled_correction_factor": unicode(last_scheduled_correction_factor(node))})))
return key
def set_time_stamp(node):
cache.set('time_stamp{}'.format(node.id), timezone.now())
def get_time_stamp(node):
time_stamp = cache.get('time_stamp{}'.format(node.id))
if time_stamp:
return time_stamp
return datetime.datetime(1970, 1, 1, tzinfo=timezone.get_current_timezone())
def last_scheduled_correction_factor(node):
"""Returns the time correction factor for a node.
The monitor data may be outdated, because of recent scheduling for a given node.
The return value is between 0 and 1, higher value indicates more time since the
last scheduling for the given node.
"""
factor = 0
max_time_diff = settings.SCHEDULER_TIME_SENSITIVITY_IN_SECONDS
current_time = timezone.now()
time_difference_in_seconds = (
current_time - get_time_stamp(node)).total_seconds()
factor = time_difference_in_seconds/float(max_time_diff)
if factor > 1:
factor = 1
elif factor < 0:
factor = 1
logger.info('Scheduler set factor to %s', unicode(factor))
return factor
def has_traits(traits, node):
"""True, if the node has all specified traits; otherwise, false.
"""
......@@ -142,11 +213,27 @@ def free_cpu_time(node):
Higher values indicate more idle time.
"""
try:
activity = node.cpu_usage / 100
inactivity = 1 - activity
cores = node.num_cores
return cores * inactivity
free_cpu_percent = 1 - node.cpu_usage
weight = node.cpu_weight
weighted_value = free_cpu_percent * weight
return weighted_value
except TypeError as e:
logger.exception('Got incorrect monitoring data for node %s. %s',
unicode(node), unicode(e))
return 0 # will result lowest priority
def free_ram(node):
"""Get an indicator number for free RAM on the node.
Higher value indicates more RAM.
"""
try:
free_ram_percent = 1 - node.ram_usage
weight = node.ram_weight
weighted_value = free_ram_percent * weight
return weighted_value
except TypeError as e:
logger.warning('Got incorrect monitoring data for node %s. %s',
unicode(node), unicode(e))
return False # monitoring data is incorrect
logger.exception('Got incorrect monitoring data for node %s. %s',
unicode(node), unicode(e))
return 0 # will result lowest priority
# -*- coding: utf-8 -*-
# Generated by Django 1.11.6 on 2017-12-13 20:18
from __future__ import unicode_literals
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('vm', '0002_interface_model'),
]
operations = [
migrations.AddField(
model_name='node',
name='cpu_weight',
field=models.FloatField(default=1.0, help_text='Indicates the relative CPU power of this node.', verbose_name='CPU Weight'),
),
migrations.AddField(
model_name='node',
name='ram_weight',
field=models.FloatField(default=1.0, help_text='Indicates the relative RAM quantity of this node.', verbose_name='RAM Weight'),
),
migrations.AddField(
model_name='node',
name='time_stamp',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now, help_text='A timestamp for the node, used by the scheduler.', verbose_name='Last Scheduled Time Stamp'),
preserve_default=False,
),
]
......@@ -865,6 +865,53 @@ class Instance(AclBase, VirtualMachineDescModel, StatusModel, OperatedMixin,
def metric_prefix(self):
return 'vm.%s' % self.vm_name
class MonitorUnavailableException(Exception):
"""Exception for monitor_info()
Indicates the unavailability of the monitoring server.
"""
pass
def monitor_info(self):
metrics = ('cpu.percent', 'memory.usage')
prefix = self.metric_prefix
params = [('target', '%s.%s' % (prefix, metric))
for metric in metrics]
params.append(('from', '-5min'))
params.append(('format', 'json'))
try:
logger.info('%s %s', settings.GRAPHITE_URL, params)
response = requests.get(settings.GRAPHITE_URL, params=params)
retval = {}
for target in response.json():
# Example:
# {"target": "circle.vm.{name}.cpu.usage",
# "datapoints": [[0.6, 1403045700], [0.5, 1403045760]
try:
metric = target['target']
if metric.startswith(prefix):
metric = metric[len(prefix):]
else:
continue
value = target['datapoints'][-2][0]
retval[metric] = float(value)
except (KeyError, IndexError, ValueError):
continue
return retval
except Exception:
logger.exception('Monitor server unavailable: ')
raise Instance.MonitorUnavailableException()
def cpu_usage(self):
return self.monitor_info().get('cpu.percent')
def ram_usage(self):
return self.monitor_info().get('memory.usage')
@contextmanager
def activity(self, code_suffix, readable_name, on_abort=None,
on_commit=None, task_uuid=None, user=None,
......
......@@ -30,7 +30,7 @@ from time import time, sleep
from django.conf import settings
from django.db.models import (
CharField, IntegerField, ForeignKey, BooleanField, ManyToManyField,
FloatField, permalink, Sum
FloatField, DateTimeField, permalink, Sum
)
from django.utils import timezone
from django.utils.translation import ugettext_lazy as _
......@@ -128,11 +128,15 @@ class Node(OperatedMixin, TimeStampedModel):
enabled = BooleanField(verbose_name=_('enabled'), default=False,
help_text=_('Indicates whether the node can '
'be used for hosting.'))
schedule_enabled = BooleanField(verbose_name=_('schedule enabled'),
default=False, help_text=_(
'Indicates whether a vm can be '
'automatically scheduled to this '
'node.'))
schedule_enabled = BooleanField(
verbose_name=_('schedule enabled'),
default=False,
help_text=_(
'Indicates whether a vm can be '
'automatically scheduled to this '
'node.'
)
)
traits = ManyToManyField(Trait, blank=True,
help_text=_("Declared traits."),
verbose_name=_('traits'))
......@@ -140,6 +144,21 @@ class Node(OperatedMixin, TimeStampedModel):
overcommit = FloatField(default=1.0, verbose_name=_("overcommit ratio"),
help_text=_("The ratio of total memory with "
"to without overcommit."))
ram_weight = FloatField(
default=1.0,
help_text=_("Indicates the relative RAM quantity of this node."),
verbose_name=_("RAM Weight")
)
cpu_weight = FloatField(
default=1.0,
help_text=_("Indicates the relative CPU power of this node."),
verbose_name=_("CPU Weight")
)
time_stamp = DateTimeField(
auto_now_add=True,
help_text=_("A timestamp for the node, used by the scheduler."),
verbose_name=_("Last Scheduled Time Stamp")
)
class Meta:
app_label = 'vm'
......@@ -162,7 +181,7 @@ class Node(OperatedMixin, TimeStampedModel):
self.get_remote_queue_name("vm", "fast")
self.get_remote_queue_name("vm", "slow")
self.get_remote_queue_name("net", "fast")
except:
except Exception:
return False
else:
return True
......@@ -353,7 +372,7 @@ class Node(OperatedMixin, TimeStampedModel):
continue
return retval
except:
except Exception:
logger.exception('Unhandled exception: ')
return self.remote_query(vm_tasks.get_node_metrics, timeout=30,
priority="fast")
......@@ -425,7 +444,7 @@ class Node(OperatedMixin, TimeStampedModel):
# [{'name': 'cloud-1234', 'state': 'RUNNING', ...}, ...]
try:
id = int(i['name'].split('-')[1])
except:
except Exception:
pass # name format doesn't match
else:
domains[id] = i['state']
......
......@@ -17,7 +17,9 @@
import logging
from django.utils import timezone
from datetime import timedelta
from django.utils.translation import ugettext_noop
from django.conf import settings
from manager.mancelery import celery
from vm.models import Node, Instance
......@@ -78,3 +80,43 @@ def garbage_collector(timeout=15):
i.notify_owners_about_expiration()
else:
logger.debug("Instance %d didn't expire." % i.pk)
@celery.task(ignore_result=True)
def auto_migrate():
"""Auto migration task for runtime scaling
"""
time_limit = settings.AUTO_MIGRATION_TIME_LIMIT_IN_HOURS
available_time = timedelta(hours=int(time_limit))
deadline = timezone.now() + available_time
while timezone.now() < deadline:
migrate_one()
def migrate_one():
"""Migrate a VM syncronously.
The target node chosen by the scheduler.
"""
nodes = [n for n in Node.objects.filter(enabled=True) if n.online]
node_max_cpu = max(nodes, key=lambda x: x.cpu_usage / x.cpu_weight)
node_max_ram = max(nodes, key=lambda x: x.ram_usage / x.ram_weight)
if node_max_cpu.cpu_usage > node_max_ram.ram_usage:
try:
instance_to_migrate = max(Instance.objects.filter(node=node_max_cpu.pk),
key=lambda x: x.cpu_usage())
instance_to_migrate.migrate(system=True)
except Instance.MonitorUnavailableException:
instance_to_migrate = max(Instance.objects.filter(node=node_max_cpu.pk),
key=(lambda x: x.get_vm_desc()["vcpu"] *
x.get_vm_desc()["cpu_share"]))
instance_to_migrate.migrate(system=True)
else:
try:
instance_to_migrate = max(Instance.objects.filter(node=node_max_ram.pk),
key=lambda x: x.ram_usage())
instance_to_migrate.migrate(system=True)
except Instance.MonitorUnavailableException:
instance_to_migrate = max(Instance.objects.filter(node=node_max_cpu.pk),
key=lambda x: x.get_vm_desc()["memory"])
instance_to_migrate.migrate(system=True)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment