Commit 6b634da4 by Máhonfai Bálint

Merge remote-tracking branch 'origin/store_fix' into export_import_disk

parents b560c0d5 205cb52f
...@@ -576,7 +576,7 @@ SESSION_COOKIE_NAME = "csessid%x" % (((getnode() // 139) ^ ...@@ -576,7 +576,7 @@ SESSION_COOKIE_NAME = "csessid%x" % (((getnode() // 139) ^
MAX_NODE_RAM = get_env_variable("MAX_NODE_RAM", 1024) MAX_NODE_RAM = get_env_variable("MAX_NODE_RAM", 1024)
MAX_NODE_CPU_CORE = get_env_variable("MAX_NODE_CPU_CORE", 10) MAX_NODE_CPU_CORE = get_env_variable("MAX_NODE_CPU_CORE", 10)
SCHEDULER_METHOD = get_env_variable("SCHEDULER_METHOD", 'random') SCHEDULER_METHOD = get_env_variable("SCHEDULER_METHOD", 'advanced')
# Url to download the client: (e.g. http://circlecloud.org/client/download/) # Url to download the client: (e.g. http://circlecloud.org/client/download/)
CLIENT_DOWNLOAD_URL = get_env_variable('CLIENT_DOWNLOAD_URL', 'http://circlecloud.org/client/download/') CLIENT_DOWNLOAD_URL = get_env_variable('CLIENT_DOWNLOAD_URL', 'http://circlecloud.org/client/download/')
...@@ -590,3 +590,12 @@ REQUEST_HOOK_URL = get_env_variable("REQUEST_HOOK_URL", "") ...@@ -590,3 +590,12 @@ REQUEST_HOOK_URL = get_env_variable("REQUEST_HOOK_URL", "")
SSHKEY_EMAIL_ADD_KEY = False SSHKEY_EMAIL_ADD_KEY = False
TWO_FACTOR_ISSUER = get_env_variable("TWO_FACTOR_ISSUER", "CIRCLE") TWO_FACTOR_ISSUER = get_env_variable("TWO_FACTOR_ISSUER", "CIRCLE")
# Default value is every day at midnight
AUTO_MIGRATION_CRONTAB = get_env_variable("AUTO_MIGRATION_CRONTAB", "0 0 * * *")
AUTO_MIGRATION_TIME_LIMIT_IN_HOURS = (
get_env_variable("AUTO_MIGRATION_TIME_LIMIT_IN_HOURS", "2"))
# Maximum time difference until the monitor's values get valid
SCHEDULER_TIME_SENSITIVITY_IN_SECONDS = (
get_env_variable("SCHEDULER_TIME_SENSITIVITY_IN_SECONDS", "60"))
...@@ -65,7 +65,10 @@ ...@@ -65,7 +65,10 @@
"modified": "2014-02-19T21:11:34.671Z", "modified": "2014-02-19T21:11:34.671Z",
"priority": 1, "priority": 1,
"traits": [], "traits": [],
"host": 1 "host": 1,
"ram_weight": 1.0,
"cpu_weight": 1.0,
"time_stamp": "2017-12-13T21:08:08.819Z"
} }
} }
] ]
...@@ -1734,3 +1734,11 @@ class TwoFactorConfirmationForm(forms.Form): ...@@ -1734,3 +1734,11 @@ class TwoFactorConfirmationForm(forms.Form):
totp = pyotp.TOTP(self.user.profile.two_factor_secret) totp = pyotp.TOTP(self.user.profile.two_factor_secret)
if not totp.verify(self.cleaned_data.get('confirmation_code')): if not totp.verify(self.cleaned_data.get('confirmation_code')):
raise ValidationError(_("Invalid confirmation code.")) raise ValidationError(_("Invalid confirmation code."))
class AutoMigrationForm(forms.Form):
minute = forms.CharField()
hour = forms.CharField()
day_of_month = forms.CharField()
month_of_year = forms.CharField()
day_of_week = forms.CharField()
...@@ -1079,6 +1079,10 @@ textarea[name="new_members"] { ...@@ -1079,6 +1079,10 @@ textarea[name="new_members"] {
max-width: 100%; max-width: 100%;
} }
#node-list-auto-migration-body {
padding: 20px;
}
#vm-list-table td.state, #vm-list-table td.state,
#vm-list-table td.memory { #vm-list-table td.memory {
white-space: nowrap; white-space: nowrap;
......
...@@ -3,4 +3,11 @@ $(function() { ...@@ -3,4 +3,11 @@ $(function() {
// find disabled nodes, set danger (red) on the rows // find disabled nodes, set danger (red) on the rows
$('.node-disabled').closest("tr").addClass('danger'); $('.node-disabled').closest("tr").addClass('danger');
}); });
$('#reschedule-now').click(function() {
$.get($(this).attr('href'), function(data){
highlight = data.result === 'ok' ? 'success' : 'danger';
addMessage(data.message, highlight);
});
return false;
});
}); });
...@@ -44,18 +44,12 @@ class NoStoreException(StoreApiException): ...@@ -44,18 +44,12 @@ class NoStoreException(StoreApiException):
pass pass
class NoOrgIdException(StoreApiException):
pass
class Store(object): class Store(object):
def __init__(self, user, default_timeout=0.5): def __init__(self, user, default_timeout=0.5):
self.store_url = settings.STORE_URL self.store_url = settings.STORE_URL
if not self.store_url: if not self.store_url or not user.profile.org_id:
raise NoStoreException raise NoStoreException
if not user.profile.org_id:
raise NoOrgIdException
self.username = 'u-%s' % user.profile.org_id self.username = 'u-%s' % user.profile.org_id
self.request_args = {'verify': settings.STORE_VERIFY_SSL} self.request_args = {'verify': settings.STORE_VERIFY_SSL}
if settings.STORE_SSL_AUTH: if settings.STORE_SSL_AUTH:
......
...@@ -41,4 +41,23 @@ ...@@ -41,4 +41,23 @@
</div><!-- -col-md-12 --> </div><!-- -col-md-12 -->
</div><!-- .row --> </div><!-- .row -->
<div class="row">
<div class="col-md-12">
<div class="panel panel-default">
<div class="panel-heading">
<a id="reschedule-now" class="btn btn-danger pull-right" href="{% url "dashboard.views.reschedule" %}">
<i class="fa fa-magic"></i> {% trans "Reschedule now" %}
</a>
<h3 class="no-margin"><i class="fa fa-truck"></i> {% trans "Virtual machine auto migration" %}</h3>
</div>
<div id="node-list-auto-migration-body">
<h1>Crontab</h1>
<form>
{{ auto_migration_form.as_p }}
</form>
</div>
</div>
</div><!-- -col-md-12 -->
</div><!-- .row -->
{% endblock %} {% endblock %}
...@@ -56,6 +56,7 @@ from .views import ( ...@@ -56,6 +56,7 @@ from .views import (
MessageList, MessageDetail, MessageCreate, MessageDelete, MessageList, MessageDetail, MessageCreate, MessageDelete,
EnableTwoFactorView, DisableTwoFactorView, EnableTwoFactorView, DisableTwoFactorView,
AclUserGroupAutocomplete, AclUserAutocomplete, AclUserGroupAutocomplete, AclUserAutocomplete,
RescheduleView,
) )
from .views.vm import vm_ops, vm_mass_ops from .views.vm import vm_ops, vm_mass_ops
from .views.node import node_ops from .views.node import node_ops
...@@ -153,6 +154,8 @@ urlpatterns = [ ...@@ -153,6 +154,8 @@ urlpatterns = [
r'(?P<time>[0-9]{1,2}[hdwy])$'), r'(?P<time>[0-9]{1,2}[hdwy])$'),
NodeListGraphView.as_view(), NodeListGraphView.as_view(),
name='dashboard.views.node-list-graph'), name='dashboard.views.node-list-graph'),
url(r'^node/reschedule/$', RescheduleView.as_view(),
name="dashboard.views.reschedule"),
url((r'^template/(?P<pk>\d+)/graph/(?P<metric>[a-z]+)/' url((r'^template/(?P<pk>\d+)/graph/(?P<metric>[a-z]+)/'
r'(?P<time>[0-9]{1,2}[hdwy])$'), r'(?P<time>[0-9]{1,2}[hdwy])$'),
TemplateGraphView.as_view(), TemplateGraphView.as_view(),
......
...@@ -25,7 +25,7 @@ from django.core.exceptions import PermissionDenied ...@@ -25,7 +25,7 @@ from django.core.exceptions import PermissionDenied
from django.core.urlresolvers import reverse_lazy from django.core.urlresolvers import reverse_lazy
from django.db.models import Count from django.db.models import Count
from django.forms.models import inlineformset_factory from django.forms.models import inlineformset_factory
from django.http import HttpResponse from django.http import HttpResponse, JsonResponse
from django.shortcuts import redirect from django.shortcuts import redirect
from django.template.loader import render_to_string from django.template.loader import render_to_string
from django.utils.translation import ugettext as _ from django.utils.translation import ugettext as _
...@@ -37,11 +37,14 @@ from django_tables2 import SingleTableView ...@@ -37,11 +37,14 @@ from django_tables2 import SingleTableView
from firewall.models import Host from firewall.models import Host
from vm.models import Node, NodeActivity, Trait from vm.models import Node, NodeActivity, Trait
from vm.tasks.vm_tasks import check_queue from vm.tasks.vm_tasks import check_queue
from vm.tasks.local_periodic_tasks import auto_migrate
from ..forms import TraitForm, HostForm, NodeForm from ..forms import TraitForm, HostForm, NodeForm, AutoMigrationForm
from ..tables import NodeListTable from ..tables import NodeListTable
from .util import AjaxOperationMixin, OperationView, GraphMixin, DeleteViewBase from .util import AjaxOperationMixin, OperationView, GraphMixin, DeleteViewBase
from manager.mancelery import crontab_parser
def get_operations(instance, user): def get_operations(instance, user):
ops = [] ops = []
...@@ -190,6 +193,14 @@ class NodeList(LoginRequiredMixin, GraphMixin, SingleTableView): ...@@ -190,6 +193,14 @@ class NodeList(LoginRequiredMixin, GraphMixin, SingleTableView):
table_class = NodeListTable table_class = NodeListTable
table_pagination = False table_pagination = False
def get_crontab(self):
return crontab_parser(settings.AUTO_MIGRATION_CRONTAB)
def get_context_data(self):
context = super(NodeList, self).get_context_data()
context["auto_migration_form"] = AutoMigrationForm(self.get_crontab())
return context
def get(self, *args, **kwargs): def get(self, *args, **kwargs):
if not self.request.user.has_perm('vm.view_statistics'): if not self.request.user.has_perm('vm.view_statistics'):
raise PermissionDenied() raise PermissionDenied()
...@@ -367,3 +378,23 @@ class NodeActivityDetail(LoginRequiredMixin, SuperuserRequiredMixin, ...@@ -367,3 +378,23 @@ class NodeActivityDetail(LoginRequiredMixin, SuperuserRequiredMixin,
).order_by('-started').select_related()) ).order_by('-started').select_related())
ctx['icon'] = _get_activity_icon(self.object) ctx['icon'] = _get_activity_icon(self.object)
return ctx return ctx
class RescheduleView(SuperuserRequiredMixin, View):
def get(self, *args, **kwargs):
try:
auto_migrate.apply_async(queue='localhost.man.slow')
except Exception as e:
msg = str(e)
result = 'error'
else:
result = 'ok'
msg = _('Reschedule has started.')
if self.request.is_ajax():
return JsonResponse({'result': result, 'message': msg})
else:
if result == 'ok':
messages.success(self.request, msg)
else:
messages.error(self.request, msg)
return redirect('dashboard.views.node-list')
...@@ -36,7 +36,7 @@ from django.views.generic import TemplateView ...@@ -36,7 +36,7 @@ from django.views.generic import TemplateView
from braces.views import LoginRequiredMixin from braces.views import LoginRequiredMixin
from ..store_api import (Store, NoStoreException, from ..store_api import (Store, NoStoreException,
NotOkException, NoOrgIdException) NotOkException)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -71,11 +71,6 @@ class StoreList(LoginRequiredMixin, TemplateView): ...@@ -71,11 +71,6 @@ class StoreList(LoginRequiredMixin, TemplateView):
return super(StoreList, self).get(*args, **kwargs) return super(StoreList, self).get(*args, **kwargs)
except NoStoreException: except NoStoreException:
messages.warning(self.request, _("No store.")) messages.warning(self.request, _("No store."))
except NoOrgIdException:
messages.warning(self.request,
_("Your organization ID is not set."
" To use the store, you need a"
" unique organization ID."))
except NotOkException: except NotOkException:
messages.warning(self.request, _("Store has some problems now." messages.warning(self.request, _("Store has some problems now."
" Try again later.")) " Try again later."))
......
...@@ -17,13 +17,27 @@ ...@@ -17,13 +17,27 @@
from celery import Celery from celery import Celery
from celery.signals import worker_ready from celery.signals import worker_ready
from celery.schedules import crontab
from datetime import timedelta from datetime import timedelta
from celery.schedules import crontab from celery.schedules import crontab
from kombu import Queue, Exchange from kombu import Queue, Exchange
from os import getenv from os import getenv
HOSTNAME = "localhost" HOSTNAME = "localhost"
QUEUE_NAME = HOSTNAME + '.man' QUEUE_NAME = HOSTNAME + '.man'
AUTO_MIGRATION_CRONTAB = getenv('AUTO_MIGRATION_CRONTAB', '0 0 * * *')
def crontab_parser(crontab):
fields = crontab.split(' ')
return dict(
minute=fields[0],
hour=fields[1],
day_of_month=fields[2],
month_of_year=fields[3],
day_of_week=fields[4],
)
celery = Celery('manager', celery = Celery('manager',
...@@ -56,6 +70,11 @@ celery.conf.update( ...@@ -56,6 +70,11 @@ celery.conf.update(
'schedule': crontab(minute=10, hour=1), 'schedule': crontab(minute=10, hour=1),
'options': {'queue': 'localhost.man'} 'options': {'queue': 'localhost.man'}
}, },
'vm.local_periodic_tasks': {
'task': 'vm.tasks.local_periodic_tasks.auto_migrate',
'schedule': crontab(**crontab_parser(AUTO_MIGRATION_CRONTAB)),
'options': {'queue': 'localhost.man.slow'},
},
} }
) )
......
...@@ -15,15 +15,18 @@ ...@@ -15,15 +15,18 @@
# You should have received a copy of the GNU General Public License along # You should have received a copy of the GNU General Public License along
# with CIRCLE. If not, see <http://www.gnu.org/licenses/>. # with CIRCLE. If not, see <http://www.gnu.org/licenses/>.
import datetime
import json
import random
from logging import getLogger from logging import getLogger
from django.conf import settings
from django.core.cache import cache
from django.utils import timezone
from django.utils.translation import ugettext_noop from django.utils.translation import ugettext_noop
from common.models import HumanReadableException
from circle.settings.base import SCHEDULER_METHOD from circle.settings.base import SCHEDULER_METHOD
from common.models import HumanReadableException
import random
logger = getLogger(__name__) logger = getLogger(__name__)
...@@ -69,14 +72,14 @@ def common_select(instance, nodes): ...@@ -69,14 +72,14 @@ def common_select(instance, nodes):
logger.warning('select_node: no enough RAM for %s', unicode(instance)) logger.warning('select_node: no enough RAM for %s', unicode(instance))
raise NotEnoughMemoryException() raise NotEnoughMemoryException()
# sort nodes first by processor usage, then priority # sort nodes first by priority
nodes.sort(key=lambda n: n.priority, reverse=True) nodes.sort(key=lambda n: n.priority, reverse=True)
nodes.sort(key=free_cpu_time, reverse=True)
return nodes return nodes
def common_evenly(instance, nodes): def common_evenly(instance, nodes):
nodes = common_select(instance, nodes) nodes = common_select(instance, nodes)
nodes.sort(key=free_cpu_time, reverse=True)
result = nodes[0] result = nodes[0]
return result return result
...@@ -87,6 +90,16 @@ def common_random(instance, nodes): ...@@ -87,6 +90,16 @@ def common_random(instance, nodes):
return result return result
def advanced_with_time_stamp(instance, nodes):
nodes = common_select(instance, nodes)
nodes.sort(key=sorting_key, reverse=True)
logger.info("SCHEDLOG: {}".format(json.dumps({
"event": "after_sort",
"list": map(lambda node: unicode(node), nodes)})))
result = nodes[0]
return result
def select_node(instance, nodes): def select_node(instance, nodes):
''' Select a node for hosting an instance based on its requirements. ''' Select a node for hosting an instance based on its requirements.
''' '''
...@@ -94,14 +107,72 @@ def select_node(instance, nodes): ...@@ -94,14 +107,72 @@ def select_node(instance, nodes):
result = common_evenly(instance, nodes) result = common_evenly(instance, nodes)
elif SCHEDULER_METHOD == 'random': elif SCHEDULER_METHOD == 'random':
result = common_random(instance, nodes) result = common_random(instance, nodes)
elif SCHEDULER_METHOD == 'advanced':
result = advanced_with_time_stamp(instance, nodes)
else: # Default method is the random else: # Default method is the random
result = common_random(instance, nodes) result = common_random(instance, nodes)
logger.info('Scheduler method: %s selected', unicode(SCHEDULER_METHOD)) logger.info("SCHEDLOG: {}".format(json.dumps(
logger.info('select_node: %s for %s', unicode(result), unicode(instance)) {"event": "select",
"node": unicode(result),
"vm": unicode(instance)})))
set_time_stamp(result)
return result return result
def sorting_key(node):
"""Determines how valuable a node is for scheduling.
"""
key = 0
corr = last_scheduled_correction_factor(node)
if free_cpu_time(node) < free_ram(node):
key = free_cpu_time(node) * corr
else:
key = free_ram(node) * corr
logger.info("SCHEDLOG: {}".format(json.dumps({
"event": "sort",
"node": unicode(node),
"sorting_key": unicode(key),
"free_cpu_time": unicode(free_cpu_time(node)),
"free_ram": unicode(free_ram(node)),
"last_scheduled_correction_factor": unicode(last_scheduled_correction_factor(node))})))
return key
def set_time_stamp(node):
cache.set('time_stamp{}'.format(node.id), timezone.now())
def get_time_stamp(node):
time_stamp = cache.get('time_stamp{}'.format(node.id))
if time_stamp:
return time_stamp
return datetime.datetime(1970, 1, 1, tzinfo=timezone.get_current_timezone())
def last_scheduled_correction_factor(node):
"""Returns the time correction factor for a node.
The monitor data may be outdated, because of recent scheduling for a given node.
The return value is between 0 and 1, higher value indicates more time since the
last scheduling for the given node.
"""
factor = 0
max_time_diff = settings.SCHEDULER_TIME_SENSITIVITY_IN_SECONDS
current_time = timezone.now()
time_difference_in_seconds = (
current_time - get_time_stamp(node)).total_seconds()
factor = time_difference_in_seconds/float(max_time_diff)
if factor > 1:
factor = 1
elif factor < 0:
factor = 1
logger.info('Scheduler set factor to %s', unicode(factor))
return factor
def has_traits(traits, node): def has_traits(traits, node):
"""True, if the node has all specified traits; otherwise, false. """True, if the node has all specified traits; otherwise, false.
""" """
...@@ -142,11 +213,27 @@ def free_cpu_time(node): ...@@ -142,11 +213,27 @@ def free_cpu_time(node):
Higher values indicate more idle time. Higher values indicate more idle time.
""" """
try: try:
activity = node.cpu_usage / 100 free_cpu_percent = 1 - node.cpu_usage
inactivity = 1 - activity weight = node.cpu_weight
cores = node.num_cores weighted_value = free_cpu_percent * weight
return cores * inactivity return weighted_value
except TypeError as e:
logger.exception('Got incorrect monitoring data for node %s. %s',
unicode(node), unicode(e))
return 0 # will result lowest priority
def free_ram(node):
"""Get an indicator number for free RAM on the node.
Higher value indicates more RAM.
"""
try:
free_ram_percent = 1 - node.ram_usage
weight = node.ram_weight
weighted_value = free_ram_percent * weight
return weighted_value
except TypeError as e: except TypeError as e:
logger.warning('Got incorrect monitoring data for node %s. %s', logger.exception('Got incorrect monitoring data for node %s. %s',
unicode(node), unicode(e)) unicode(node), unicode(e))
return False # monitoring data is incorrect return 0 # will result lowest priority
# -*- coding: utf-8 -*-
# Generated by Django 1.11.6 on 2017-12-13 20:18
from __future__ import unicode_literals
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('vm', '0002_interface_model'),
]
operations = [
migrations.AddField(
model_name='node',
name='cpu_weight',
field=models.FloatField(default=1.0, help_text='Indicates the relative CPU power of this node.', verbose_name='CPU Weight'),
),
migrations.AddField(
model_name='node',
name='ram_weight',
field=models.FloatField(default=1.0, help_text='Indicates the relative RAM quantity of this node.', verbose_name='RAM Weight'),
),
migrations.AddField(
model_name='node',
name='time_stamp',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now, help_text='A timestamp for the node, used by the scheduler.', verbose_name='Last Scheduled Time Stamp'),
preserve_default=False,
),
]
...@@ -865,6 +865,53 @@ class Instance(AclBase, VirtualMachineDescModel, StatusModel, OperatedMixin, ...@@ -865,6 +865,53 @@ class Instance(AclBase, VirtualMachineDescModel, StatusModel, OperatedMixin,
def metric_prefix(self): def metric_prefix(self):
return 'vm.%s' % self.vm_name return 'vm.%s' % self.vm_name
class MonitorUnavailableException(Exception):
"""Exception for monitor_info()
Indicates the unavailability of the monitoring server.
"""
pass
def monitor_info(self):
metrics = ('cpu.percent', 'memory.usage')
prefix = self.metric_prefix
params = [('target', '%s.%s' % (prefix, metric))
for metric in metrics]
params.append(('from', '-5min'))
params.append(('format', 'json'))
try:
logger.info('%s %s', settings.GRAPHITE_URL, params)
response = requests.get(settings.GRAPHITE_URL, params=params)
retval = {}
for target in response.json():
# Example:
# {"target": "circle.vm.{name}.cpu.usage",
# "datapoints": [[0.6, 1403045700], [0.5, 1403045760]
try:
metric = target['target']
if metric.startswith(prefix):
metric = metric[len(prefix):]
else:
continue
value = target['datapoints'][-2][0]
retval[metric] = float(value)
except (KeyError, IndexError, ValueError):
continue
return retval
except Exception:
logger.exception('Monitor server unavailable: ')
raise Instance.MonitorUnavailableException()
def cpu_usage(self):
return self.monitor_info().get('cpu.percent')
def ram_usage(self):
return self.monitor_info().get('memory.usage')
@contextmanager @contextmanager
def activity(self, code_suffix, readable_name, on_abort=None, def activity(self, code_suffix, readable_name, on_abort=None,
on_commit=None, task_uuid=None, user=None, on_commit=None, task_uuid=None, user=None,
......
...@@ -30,7 +30,7 @@ from time import time, sleep ...@@ -30,7 +30,7 @@ from time import time, sleep
from django.conf import settings from django.conf import settings
from django.db.models import ( from django.db.models import (
CharField, IntegerField, ForeignKey, BooleanField, ManyToManyField, CharField, IntegerField, ForeignKey, BooleanField, ManyToManyField,
FloatField, permalink, Sum FloatField, DateTimeField, permalink, Sum
) )
from django.utils import timezone from django.utils import timezone
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
...@@ -128,11 +128,15 @@ class Node(OperatedMixin, TimeStampedModel): ...@@ -128,11 +128,15 @@ class Node(OperatedMixin, TimeStampedModel):
enabled = BooleanField(verbose_name=_('enabled'), default=False, enabled = BooleanField(verbose_name=_('enabled'), default=False,
help_text=_('Indicates whether the node can ' help_text=_('Indicates whether the node can '
'be used for hosting.')) 'be used for hosting.'))
schedule_enabled = BooleanField(verbose_name=_('schedule enabled'), schedule_enabled = BooleanField(
default=False, help_text=_( verbose_name=_('schedule enabled'),
'Indicates whether a vm can be ' default=False,
'automatically scheduled to this ' help_text=_(
'node.')) 'Indicates whether a vm can be '
'automatically scheduled to this '
'node.'
)
)
traits = ManyToManyField(Trait, blank=True, traits = ManyToManyField(Trait, blank=True,
help_text=_("Declared traits."), help_text=_("Declared traits."),
verbose_name=_('traits')) verbose_name=_('traits'))
...@@ -140,6 +144,21 @@ class Node(OperatedMixin, TimeStampedModel): ...@@ -140,6 +144,21 @@ class Node(OperatedMixin, TimeStampedModel):
overcommit = FloatField(default=1.0, verbose_name=_("overcommit ratio"), overcommit = FloatField(default=1.0, verbose_name=_("overcommit ratio"),
help_text=_("The ratio of total memory with " help_text=_("The ratio of total memory with "
"to without overcommit.")) "to without overcommit."))
ram_weight = FloatField(
default=1.0,
help_text=_("Indicates the relative RAM quantity of this node."),
verbose_name=_("RAM Weight")
)
cpu_weight = FloatField(
default=1.0,
help_text=_("Indicates the relative CPU power of this node."),
verbose_name=_("CPU Weight")
)
time_stamp = DateTimeField(
auto_now_add=True,
help_text=_("A timestamp for the node, used by the scheduler."),
verbose_name=_("Last Scheduled Time Stamp")
)
class Meta: class Meta:
app_label = 'vm' app_label = 'vm'
...@@ -162,7 +181,7 @@ class Node(OperatedMixin, TimeStampedModel): ...@@ -162,7 +181,7 @@ class Node(OperatedMixin, TimeStampedModel):
self.get_remote_queue_name("vm", "fast") self.get_remote_queue_name("vm", "fast")
self.get_remote_queue_name("vm", "slow") self.get_remote_queue_name("vm", "slow")
self.get_remote_queue_name("net", "fast") self.get_remote_queue_name("net", "fast")
except: except Exception:
return False return False
else: else:
return True return True
...@@ -341,19 +360,25 @@ class Node(OperatedMixin, TimeStampedModel): ...@@ -341,19 +360,25 @@ class Node(OperatedMixin, TimeStampedModel):
# Example: # Example:
# {"target": "circle.szianode.cpu.usage", # {"target": "circle.szianode.cpu.usage",
# "datapoints": [[0.6, 1403045700], [0.5, 1403045760] # "datapoints": [[0.6, 1403045700], [0.5, 1403045760]
logger.info('MONITOR_TARGET: %s', target)
try: try:
metric = target['target'] metric = target['target']
if metric.startswith(prefix): if metric.startswith(prefix):
metric = metric[len(prefix):] metric = metric[len(prefix):]
else: else:
logger.info('MONITOR_MET: %s %s', target, metric)
continue continue
value = target['datapoints'][-2][0] value = target['datapoints'][-1][0]
if value is None:
value = target['datapoints'][-2][0]
retval[metric] = float(value) retval[metric] = float(value)
logger.info('MONITOR_RETVAL: %s %s, %s', target['target'], metric, retval[metric])
except (KeyError, IndexError, ValueError, TypeError): except (KeyError, IndexError, ValueError, TypeError):
logger.info('MONITOR_ERR: %s %s', metric, value)
continue continue
return retval return retval
except: except Exception:
logger.exception('Unhandled exception: ') logger.exception('Unhandled exception: ')
return self.remote_query(vm_tasks.get_node_metrics, timeout=30, return self.remote_query(vm_tasks.get_node_metrics, timeout=30,
priority="fast") priority="fast")
...@@ -416,7 +441,7 @@ class Node(OperatedMixin, TimeStampedModel): ...@@ -416,7 +441,7 @@ class Node(OperatedMixin, TimeStampedModel):
vm_state_changed hook. vm_state_changed hook.
""" """
domains = {} domains = {}
domain_list = self.remote_query(vm_tasks.list_domains_info, timeout=5, domain_list = self.remote_query(vm_tasks.list_domains_info, timeout=10,
priority="fast") priority="fast")
if domain_list is None: if domain_list is None:
logger.info("Monitoring failed at: %s", self.name) logger.info("Monitoring failed at: %s", self.name)
...@@ -425,7 +450,7 @@ class Node(OperatedMixin, TimeStampedModel): ...@@ -425,7 +450,7 @@ class Node(OperatedMixin, TimeStampedModel):
# [{'name': 'cloud-1234', 'state': 'RUNNING', ...}, ...] # [{'name': 'cloud-1234', 'state': 'RUNNING', ...}, ...]
try: try:
id = int(i['name'].split('-')[1]) id = int(i['name'].split('-')[1])
except: except Exception:
pass # name format doesn't match pass # name format doesn't match
else: else:
domains[id] = i['state'] domains[id] = i['state']
......
...@@ -17,7 +17,9 @@ ...@@ -17,7 +17,9 @@
import logging import logging
from django.utils import timezone from django.utils import timezone
from datetime import timedelta
from django.utils.translation import ugettext_noop from django.utils.translation import ugettext_noop
from django.conf import settings
from manager.mancelery import celery from manager.mancelery import celery
from vm.models import Node, Instance from vm.models import Node, Instance
...@@ -33,7 +35,7 @@ def update_domain_states(): ...@@ -33,7 +35,7 @@ def update_domain_states():
@celery.task(ignore_result=True) @celery.task(ignore_result=True)
def garbage_collector(timeout=15): def garbage_collector(offset=timezone.timedelta(seconds=20)):
"""Garbage collector for instances. """Garbage collector for instances.
Suspends and destroys expired instances. Suspends and destroys expired instances.
...@@ -42,8 +44,10 @@ def garbage_collector(timeout=15): ...@@ -42,8 +44,10 @@ def garbage_collector(timeout=15):
:type timeout: int :type timeout: int
""" """
now = timezone.now() now = timezone.now()
bw = 0
for i in Instance.objects.filter(destroyed_at=None).all(): for i in Instance.objects.filter(destroyed_at=None).all():
if i.time_of_delete and now > i.time_of_delete: if i.time_of_delete and now > i.time_of_delete + offset and bw < 20:
bw = bw + 1
i.destroy.async(system=True) i.destroy.async(system=True)
logger.info("Expired instance %d destroyed.", i.pk) logger.info("Expired instance %d destroyed.", i.pk)
try: try:
...@@ -57,10 +61,11 @@ def garbage_collector(timeout=15): ...@@ -57,10 +61,11 @@ def garbage_collector(timeout=15):
logger.debug('Could not notify owner of instance %d .%s', logger.debug('Could not notify owner of instance %d .%s',
i.pk, unicode(e)) i.pk, unicode(e))
elif (i.time_of_suspend and now > i.time_of_suspend and elif (i.time_of_suspend and now > i.time_of_suspend and
i.state == 'RUNNING'): i.state == 'RUNNING' and bw < 20):
bw = bw + 1
logger.info("Expired instance %d suspended." % i.pk) logger.info("Expired instance %d suspended." % i.pk)
try: try:
i.sleep.async(system=True) i.sleep.async(system=True)
i.owner.profile.notify( i.owner.profile.notify(
ugettext_noop('%(instance)s suspended'), ugettext_noop('%(instance)s suspended'),
ugettext_noop( ugettext_noop(
...@@ -68,8 +73,8 @@ def garbage_collector(timeout=15): ...@@ -68,8 +73,8 @@ def garbage_collector(timeout=15):
'has been suspended due to expiration. ' 'has been suspended due to expiration. '
'You can resume or destroy it.'), 'You can resume or destroy it.'),
instance=i.name, url=i.get_absolute_url()) instance=i.name, url=i.get_absolute_url())
except ActivityInProgressError: except ActivityInProgressError:
logger.error("Expired instance %d can't be destroyed due the AtctivityInPorgressError.", i.pk) logger.error("Expired instance %d can't be destroyed due the AtctivityInPorgressError.", i.pk)
except Exception as e: except Exception as e:
logger.info('Could not notify owner of instance %d .%s', logger.info('Could not notify owner of instance %d .%s',
i.pk, unicode(e)) i.pk, unicode(e))
...@@ -77,4 +82,44 @@ def garbage_collector(timeout=15): ...@@ -77,4 +82,44 @@ def garbage_collector(timeout=15):
logger.debug("Instance %d expires soon." % i.pk) logger.debug("Instance %d expires soon." % i.pk)
i.notify_owners_about_expiration() i.notify_owners_about_expiration()
else: else:
logger.debug("Instance %d didn't expire." % i.pk) logger.debug("Instance %d didn't expire. bw:%d", i.pk, bw)
@celery.task(ignore_result=True)
def auto_migrate():
"""Auto migration task for runtime scaling
"""
time_limit = settings.AUTO_MIGRATION_TIME_LIMIT_IN_HOURS
available_time = timedelta(hours=int(time_limit))
deadline = timezone.now() + available_time
while timezone.now() < deadline:
migrate_one()
def migrate_one():
"""Migrate a VM syncronously.
The target node chosen by the scheduler.
"""
nodes = [n for n in Node.objects.filter(enabled=True) if n.online]
node_max_cpu = max(nodes, key=lambda x: x.cpu_usage / x.cpu_weight)
node_max_ram = max(nodes, key=lambda x: x.ram_usage / x.ram_weight)
if node_max_cpu.cpu_usage > node_max_ram.ram_usage:
try:
instance_to_migrate = max(Instance.objects.filter(node=node_max_cpu.pk),
key=lambda x: x.cpu_usage())
instance_to_migrate.migrate(system=True)
except Instance.MonitorUnavailableException:
instance_to_migrate = max(Instance.objects.filter(node=node_max_cpu.pk),
key=(lambda x: x.get_vm_desc()["vcpu"] *
x.get_vm_desc()["cpu_share"]))
instance_to_migrate.migrate(system=True)
else:
try:
instance_to_migrate = max(Instance.objects.filter(node=node_max_ram.pk),
key=lambda x: x.ram_usage())
instance_to_migrate.migrate(system=True)
except Instance.MonitorUnavailableException:
instance_to_migrate = max(Instance.objects.filter(node=node_max_cpu.pk),
key=lambda x: x.get_vm_desc()["memory"])
instance_to_migrate.migrate(system=True)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment