[v3 1/1] Virtual machine migration

From: Simon Jin <simonjin@linux.vnet.ibm.com> v3-v2: address commets from vianac@linux.vnet.ibm.com. Signed-off-by: Simon Jin <simonjin@linux.vnet.ibm.com> --- docs/API.md | 5 ++ src/kimchi/control/vms.py | 1 + src/kimchi/i18n.py | 9 ++++ src/kimchi/model/vms.py | 116 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+) diff --git a/docs/API.md b/docs/API.md index 9b866f3..17073c8 100644 --- a/docs/API.md +++ b/docs/API.md @@ -132,6 +132,11 @@ the following general conventions: It emulates the power reset button on a machine. Note that there is a risk of data loss caused by reset without the guest OS shutdown. * connect: Prepare the connection for spice or vnc +* migrate: Migrate a virtual machine to a remote server, only support live mode without block migration. + * remoteHost: IP address or hostname of the remote server. + * user: User of the remote server. + * passwd: Passwd of user on remote server. + * secure: Migrate in P2P secure tunel mode. * clone: Create a new VM identical to this VM. The new VM's name, UUID and network MAC addresses will be generated automatically. Each existing diff --git a/src/kimchi/control/vms.py b/src/kimchi/control/vms.py index a1589ef..e6870e0 100644 --- a/src/kimchi/control/vms.py +++ b/src/kimchi/control/vms.py @@ -42,6 +42,7 @@ class VM(Resource): for ident, node in sub_nodes.items(): setattr(self, ident, node(model, self.ident)) self.start = self.generate_action_handler('start') + self.migrate = self.generate_action_handler_task('migrate', ['remoteHost', 'user', 'passwd', 'secure']) self.poweroff = self.generate_action_handler('poweroff') self.shutdown = self.generate_action_handler('shutdown') self.reset = self.generate_action_handler('reset') diff --git a/src/kimchi/i18n.py b/src/kimchi/i18n.py index e823f2b..8d87671 100644 --- a/src/kimchi/i18n.py +++ b/src/kimchi/i18n.py @@ -105,6 +105,15 @@ messages = { "KCHVM0033E": _("Virtual machine '%(name)s' must be stopped before cloning it."), "KCHVM0034E": _("Insufficient disk space to clone virtual machine '%(name)s'"), "KCHVM0035E": _("Unable to clone VM '%(name)s'. Details: %(err)s"), + "KCHVM0036E": _("Can't migrate to localhost."), + "KCHVM0037E": _("Can't migrate vm %(name)s on Hypervisor arch %(srcarch)s.to a different Hypervisor arch %(destarch)s."), + "KCHVM0038E": _("Can't migrate from %(srchyp)s to a different Hypervisor type %(desthyp)s."), + "KCHVM0039E": _("Can't migrate vm %(name)s, vm has passthrough device %(hostdevs)s."), + "KCHVM0040E": _("Failed to set migrateSetMaxDowntime when migrate vm %(name)s. Details: %(err)s "), + "KCHVM0041E": _("Can not migrate vm %(name)s when its in %(state)s state."), + "KCHVM0042E": _("Failed Migrate vm %(name)s due error: %(err)s"), + "KCHVM0043E": _("Failed destory vm %(name)s from migration recover. Details: %(err)s"), + "KCHVM0044E": _("Failed disconnect %(remoteuri)s when finish migrate vm %(name)s. Details: %(err)s"), "KCHVMHDEV0001E": _("VM %(vmid)s does not contain directly assigned host device %(dev_name)s."), "KCHVMHDEV0002E": _("The host device %(dev_name)s is not allowed to directly assign to VM."), diff --git a/src/kimchi/model/vms.py b/src/kimchi/model/vms.py index d194049..ed8f96f 100644 --- a/src/kimchi/model/vms.py +++ b/src/kimchi/model/vms.py @@ -33,6 +33,7 @@ from cherrypy.process.plugins import BackgroundTask from kimchi import model, vnc from kimchi.config import READONLY_POOL_TYPE from kimchi.exception import InvalidOperation, InvalidParameter +from kimchi.model.libvirtconnection import LibvirtConnection from kimchi.exception import NotFoundError, OperationFailed from kimchi.model.config import CapabilitiesModel from kimchi.model.tasks import TaskModel @@ -790,6 +791,121 @@ class VMModel(object): except libvirt.libvirtError as e: raise OperationFailed("KCHVM0019E", {'name': name, 'err': e.get_error_message()}) + + def _preCheck(self, name, destConn, remoteHost): + kimchi_log.debug('precheck migrate %s' % name) + + _destConn = destConn.get() + if remoteHost in ['localhost', '127.0.0.1']: + kimchi_log.debug('vm %s Can not migrate to localhost.' % name) + raise OperationFailed("KCHVM0036E", {'name': name, + 'remoteHost': remoteHost}) + destarch = _destConn.getInfo()[0] + if self.conn.get().getInfo()[0] != destarch: + kimchi_log.debug('vm %s can not migrate to different arch server.' % (name, destarch)) + raise OperationFailed("KCHVM0037E", {'name': name, + 'srcarch': self.conn.get().getInfo()[0], + 'destarch': destarch}) + desthyp = _destConn.getType() + if self.conn.get().getType() != desthyp: + kimchi_log.debug('vm %s can not migrate to a different hypervisor' % name) + raise OperationFailed("KCHVM0038E", {'name': self.name, + 'srchyp':self.conn.get().getType(), + 'desthyp':desthyp}) + + + #model.host import DOM_STATE_MAP, vmhostdevs import host, + #to void loop import DOM_STATE_MAP, move import VMHostDevsModel here + from kimchi.model.vmhostdevs import VMHostDevsModel + + #check if there is any passthrough devices belong to this vm + vmhostdevs = VMHostDevsModel(conn = self.conn) + hostdevs = vmhostdevs.get_list(name) + if hostdevs: + raise OperationFailed("KCHVM0039E", {'name' : name, + 'hostdevs': hostdevs}) + #open destination connect to migrate + @staticmethod + def _getRemoteConn(self, remoteHost, user, transport = 'ssh'): + duri = 'qemu+%s://%s@%s/system' % (transport, + user,remoteHost) + conn = LibvirtConnection(duri) + return conn + + def _do_migrate(self, name, destConn, remoteHost, secure, cb): + _destConn = destConn.get() + + cb('starting a migration') + kimchi_log.debug('migrate %s start' % name) + dom = self.get_vm(name, self.conn) + flag = 0 + if secure: + flag |= (libvirt.VIR_MIGRATE_PEER2PEER | + libvirt.VIR_MIGRATE_TUNNELLED) + + state = DOM_STATE_MAP[dom.info()[0]] + if state in ['running','paused']: + flag |= libvirt.VIR_MIGRATE_LIVE + try: + dom.migrateSetMaxDowntime(0) + except libvirt.libvirtError as e: + raise OperationFailed("KCHVM0040E", + {'name': name, 'err': e.get_error_message()}) + elif state == 'shutoff': + flag |= libvirt.VIR_MIGRATE_OFFLINE + else: + kimchi_log.debug('Can not migrate vm %s when its in %s.' % (name, state)) + raise OperationFailed("KCHVM0041E", {'name': self.name}) + try: + dom.migrate(_destConn, flag, None, None, 0) + except libvirt.libvirtError as e: + kimchi_log.error('migrate %s to %s failed' % (name, + remoteHost)) + self._recover(cb) + raise OperationFailed('KCHVM0042E', {'err': e.message, + 'name': name}) + finally: + self._finish(name, destConn, cb) + + def migrate(self, name, remoteHost, user, passwd = None, secure = None): + destconn = self._getRemoteConn(remoteHost, user, passwd) + self._preCheck(name, destconn, remoteHost, secure) + + task_id = add_task('/vms/%s/migrate' % name, self._do_migrate, self.objstore, + {'name' : name, 'destconn': destconn, + 'remoteHost': remoteHost, 'secure': secure}) + + # Record vm migrate task for future querying + try: + with self.objstore as session: + session.store('migrate', name, task_id) + return task_id + except Exception as e: + raise OperationFailed('KCHPOOL0037E', {'err': e.message}) + + def _recover(self, name, destconn, cb): + destvm = self.get_vm(name, destconn) + #recover + cb('recover from a failed migration',False) + kimchi_log.debug('recover from a failed migrate %s' % name) + try: + destvm.destroy() + kimchi_log.error("destory migrate vm on destination server") + except libvirt.libvirtError as e: + raise OperationFailed('KCHVM0043E', {'name' : name, + 'err': e.message}) + + def _finish(self, name, destConn, cb): + _destConn = destConn.get() + kimchi_log.debug('finished migrate %s' % name) + try: + _destConn.close() + except libvirt.libvirtError as e: + raise OperationFailed('KCHVM0044E', {'name' : name, + 'remoteuri' : _destConn.getURI(), + 'err': e.message}) + finally: + cb('Migrate finished', True) def poweroff(self, name): dom = self.get_vm(name, self.conn) -- 1.8.3.1

On 13-11-2014 05:02, simonjin@linux.vnet.ibm.com wrote:
+ "KCHVM0043E": _("Failed destory vm %(name)s from migration recover. Details: %(err)s"),
*destroy
@@ -33,6 +33,7 @@ from cherrypy.process.plugins import BackgroundTask from kimchi import model, vnc from kimchi.config import READONLY_POOL_TYPE from kimchi.exception import InvalidOperation, InvalidParameter +from kimchi.model.libvirtconnection import LibvirtConnection from kimchi.exception import NotFoundError, OperationFailed from kimchi.model.config import CapabilitiesModel from kimchi.model.tasks import TaskModel
We keep import statements sorted in alphabetical order. You need to move the line "from kimchi.model.libvirtconnection..." two lines below, right between "from kimchi.model.config..." and "from kimchi.model.tasks...", in alphabetical order.
+ raise OperationFailed("KCHVM0036E", {'name': name, + 'remoteHost': remoteHost})
The error message KCHVM0036E, in src/kimchi/i18n.py, doesn't contain the format masks 'name' and 'remoteHost' as provided in the exception above:
"KCHVM0036E": _("Can't migrate to localhost."),
You need to provide exactly the same parameters declared in the error message. You either don't pass 'name' and 'remoteHost' when building that exception or you add 'name' and 'remoteHost' somewhere in the error message.
+ destarch = _destConn.getInfo()[0] + if self.conn.get().getInfo()[0] != destarch: + kimchi_log.debug('vm %s can not migrate to different arch server.' % (name, destarch)) + raise OperationFailed("KCHVM0037E", {'name': name, + 'srcarch': self.conn.get().getInfo()[0], + 'destarch': destarch})
The code above is running "self.conn.get().getInfo()" more than once. You can store that value in a variable so you don't have to talk to libvirt multiple times. Actually, "self.conn" is used in more places later in the code, so that value can also be stored in a separate value. Try not to contact libvirt several times for the same information, it has a performance cost.
+ desthyp = _destConn.getType() + if self.conn.get().getType() != desthyp: + kimchi_log.debug('vm %s can not migrate to a different hypervisor' % name) + raise OperationFailed("KCHVM0038E", {'name': self.name, + 'srchyp':self.conn.get().getType(), + 'desthyp':desthyp})
Again, store the value from "self.conn.getType()" in a local variable so you don't need to contact libvirt repeatedly. "self.conn" should already be in a separate variable according to my comment above. Also, the error message KCHVM0038E contains the format masks 'srchyp' and 'desthyp' but you're providing 'name' as well. Those parameters don't match. And "self.name" is an invalid reference, you need to use the local variable "name".
+ #check if there is any passthrough devices belong to this vm + vmhostdevs = VMHostDevsModel(conn = self.conn)
Usually we keep the model objects as instance variables. That means that "vmhostdevs" should be declared in VMModel's constructor - along with many other model variables - as "self.vmhostdevs". Another useful verification to be added to "_preCheck" is to allow only one migration per VM happening at the same time. In order to do that, you need to check if a task with the 'target_uri' = '/vms/<vm-name>/migrate' is already running; that's the URI used when creating a migration task later in this code.
+ #open destination connect to migrate + @staticmethod + def _getRemoteConn(self, remoteHost, user, transport = 'ssh'): + duri = 'qemu+%s://%s@%s/system' % (transport, + user,remoteHost) + conn = LibvirtConnection(duri) + return conn
A @staticmethod shouldn't receive the parameter "self". You can read the section https://docs.python.org/2/library/functions.html#staticmethod to learn more about static methods in Python. When you need to call it, do something like: VMModel._getRemoteConn(...) Notice that the function is prefixed with the class, not the object instance, as its implementation doesn't depend on a specific instance.
def _do_migrate(self, name, destConn, remoteHost, secure, cb):
This function is supposed to be executed from inside a Task. The parameters of such function should be, in that exact order: def fn(cb, params): where "params" is a dict containing the other parameters. That means that the parameters "name", "destConn", "remoteHost" and "secure" should be inside the dict "params", and you need to extract them from the dict before using them inside "_do_migrate".
+ else: + kimchi_log.debug('Can not migrate vm %s when its in %s.' % (name, state)) + raise OperationFailed("KCHVM0041E", {'name': self.name})
The error message KCHVM0041E contains the format masks 'name' and 'state'. You need to provide both parameters when creating an exception with that message (i.e.'state' is missing). Also, "self.name" is an invalid reference - use "name" instead.
+ except libvirt.libvirtError as e: + kimchi_log.error('migrate %s to %s failed' % (name, + remoteHost)) + self._recover(cb) + raise OperationFailed('KCHVM0042E', {'err': e.message,
Are you sure that if an error occurs during migration, we need to undefine the remote domain? In my view, if the migration failed, there won't be a remote domain to undefine - well, it failed. And I couldn't find this on libvirt's documentation so I'm not sure of the behaviour in that case.
+ finally: + self._finish(name, destConn, cb)
The function "_finish" is called from inside the "finally" block, which means it will be called regardless of whether an exception was raised or not. However, that function calls "cb(..., True)", which indicates that the task has finished successfully. We should not call "cb(..., True)" if an exception was raised and the migration failed.
+ destconn = self._getRemoteConn(remoteHost, user, passwd)
The function "_getRemoteConn" was defined as:
def _getRemoteConn(self, remoteHost, user, transport = 'ssh'):
The parameters provided above don't match the defined parameters. The function expects a transport method in its last parameter but the code above is passing a password instead.
+ # Record vm migrate task for future querying + try: + with self.objstore as session: + session.store('migrate', name, task_id) + return task_id + except Exception as e: + raise OperationFailed('KCHPOOL0037E', {'err': e.message})
As I commented in the previous e-mail thread, the function "add_task" will add the task to the object store. You don't need to do that. Before sending every patch, you need to make sure it follows our code guideline. You can run the command "make check-local" on your shell and check its output. There are a few errors pointed out by that tool for this patch, so be sure to fix every one of them. How do you test this code? Were you able to perform a VM migration successfully using this patch? You can try running the command below on your shell: $ curl -s -k -u <username>:<password> -H 'Content-Type: application/json' -H 'Accept: application/json' https://localhost:8001/vms/<vm-name>/migrate -X POST -d '{"remoteHost": "<remote-host>", "user": "<remote-user>", "password": "<remote-password>", "secure": True}' and make sure the migration actually happened. Best regards, Crístian.

Thanks for your review. 于 2014年11月14日 01:54, Crístian Viana 写道:
On 13-11-2014 05:02, simonjin@linux.vnet.ibm.com wrote:
+ "KCHVM0043E": _("Failed destory vm %(name)s from migration recover. Details: %(err)s"),
*destroy
@@ -33,6 +33,7 @@ from cherrypy.process.plugins import BackgroundTask from kimchi import model, vnc from kimchi.config import READONLY_POOL_TYPE from kimchi.exception import InvalidOperation, InvalidParameter +from kimchi.model.libvirtconnection import LibvirtConnection from kimchi.exception import NotFoundError, OperationFailed from kimchi.model.config import CapabilitiesModel from kimchi.model.tasks import TaskModel
We keep import statements sorted in alphabetical order. You need to move the line "from kimchi.model.libvirtconnection..." two lines below, right between "from kimchi.model.config..." and "from kimchi.model.tasks...", in alphabetical order.
OK
+ raise OperationFailed("KCHVM0036E", {'name': name, + 'remoteHost': remoteHost})
The error message KCHVM0036E, in src/kimchi/i18n.py, doesn't contain the format masks 'name' and 'remoteHost' as provided in the exception above:
"KCHVM0036E": _("Can't migrate to localhost."),
You need to provide exactly the same parameters declared in the error message. You either don't pass 'name' and 'remoteHost' when building that exception or you add 'name' and 'remoteHost' somewhere in the error message.
+ destarch = _destConn.getInfo()[0] + if self.conn.get().getInfo()[0] != destarch: + kimchi_log.debug('vm %s can not migrate to different arch server.' % (name, destarch)) + raise OperationFailed("KCHVM0037E", {'name': name, + 'srcarch': self.conn.get().getInfo()[0], + 'destarch': destarch})
The code above is running "self.conn.get().getInfo()" more than once. You can store that value in a variable so you don't have to talk to libvirt multiple times. Actually, "self.conn" is used in more places later in the code, so that value can also be stored in a separate value. Try not to contact libvirt several times for the same information, it has a performance cost.
self.conn it self doesn't cost anything.
+ desthyp = _destConn.getType() + if self.conn.get().getType() != desthyp: + kimchi_log.debug('vm %s can not migrate to a different hypervisor' % name) + raise OperationFailed("KCHVM0038E", {'name': self.name, + 'srchyp':self.conn.get().getType(), + 'desthyp':desthyp})
Again, store the value from "self.conn.getType()" in a local variable so you don't need to contact libvirt repeatedly. "self.conn" should already be in a separate variable according to my comment above.
Also, the error message KCHVM0038E contains the format masks 'srchyp' and 'desthyp' but you're providing 'name' as well. Those parameters don't match.
And "self.name" is an invalid reference, you need to use the local variable "name".
+ #check if there is any passthrough devices belong to this vm + vmhostdevs = VMHostDevsModel(conn = self.conn)
Usually we keep the model objects as instance variables. That means that "vmhostdevs" should be declared in VMModel's constructor - along with many other model variables - as "self.vmhostdevs".
Due to the reason in code that VMHostDevsModel can't be imported at the top of the class, the workaround is import and get the instance here.
Another useful verification to be added to "_preCheck" is to allow only one migration per VM happening at the same time. In order to do that, you need to check if a task with the 'target_uri' = '/vms/<vm-name>/migrate' is already running; that's the URI used when creating a migration task later in this code.
Will do.
+ #open destination connect to migrate + @staticmethod + def _getRemoteConn(self, remoteHost, user, transport = 'ssh'): + duri = 'qemu+%s://%s@%s/system' % (transport, + user,remoteHost) + conn = LibvirtConnection(duri) + return conn
A @staticmethod shouldn't receive the parameter "self". You can read the section https://docs.python.org/2/library/functions.html#staticmethod to learn more about static methods in Python. When you need to call it, do something like:
VMModel._getRemoteConn(...)
Notice that the function is prefixed with the class, not the object instance, as its implementation doesn't depend on a specific instance.
OK
def _do_migrate(self, name, destConn, remoteHost, secure, cb):
This function is supposed to be executed from inside a Task. The parameters of such function should be, in that exact order:
def fn(cb, params):
where "params" is a dict containing the other parameters. That means that the parameters "name", "destConn", "remoteHost" and "secure" should be inside the dict "params", and you need to extract them from the dict before using them inside "_do_migrate".
OK
+ else: + kimchi_log.debug('Can not migrate vm %s when its in %s.' % (name, state)) + raise OperationFailed("KCHVM0041E", {'name': self.name})
The error message KCHVM0041E contains the format masks 'name' and 'state'. You need to provide both parameters when creating an exception with that message (i.e.'state' is missing).
Also, "self.name" is an invalid reference - use "name" instead.
OK
+ except libvirt.libvirtError as e: + kimchi_log.error('migrate %s to %s failed' % (name, + remoteHost)) + self._recover(cb) + raise OperationFailed('KCHVM0042E', {'err': e.message,
Are you sure that if an error occurs during migration, we need to undefine the remote domain? In my view, if the migration failed, there won't be a remote domain to undefine - well, it failed. And I couldn't find this on libvirt's documentation so I'm not sure of the behaviour in that case.
OK
+ finally: + self._finish(name, destConn, cb)
The function "_finish" is called from inside the "finally" block, which means it will be called regardless of whether an exception was raised or not. However, that function calls "cb(..., True)", which indicates that the task has finished successfully. We should not call "cb(..., True)" if an exception was raised and the migration failed.
+ destconn = self._getRemoteConn(remoteHost, user, passwd)
The function "_getRemoteConn" was defined as:
def _getRemoteConn(self, remoteHost, user, transport = 'ssh'):
The parameters provided above don't match the defined parameters. The function expects a transport method in its last parameter but the code above is passing a password instead.
+ # Record vm migrate task for future querying + try: + with self.objstore as session: + session.store('migrate', name, task_id) + return task_id + except Exception as e: + raise OperationFailed('KCHPOOL0037E', {'err': e.message})
As I commented in the previous e-mail thread, the function "add_task" will add the task to the object store. You don't need to do that.
Will need this to verify whether the vm is already in migration, right ?
Before sending every patch, you need to make sure it follows our code guideline. You can run the command "make check-local" on your shell and check its output. There are a few errors pointed out by that tool for this patch, so be sure to fix every one of them.
How do you test this code? Were you able to perform a VM migration successfully using this patch? You can try running the command below on your shell:
$ curl -s -k -u <username>:<password> -H 'Content-Type: application/json' -H 'Accept: application/json' https://localhost:8001/vms/<vm-name>/migrate -X POST -d '{"remoteHost": "<remote-host>", "user": "<remote-user>", "password": "<remote-password>", "secure": True}'
and make sure the migration actually happened.
Best regards, Crístian.

On 14-11-2014 07:04, simonjin wrote:
The code above is running "self.conn.get().getInfo()" more than once. You can store that value in a variable so you don't have to talk to libvirt multiple times. Actually, "self.conn" is used in more places later in the code, so that value can also be stored in a separate value. Try not to contact libvirt several times for the same information, it has a performance cost.
self.conn it self doesn't cost anything.
But there's no need to declare a function parameter if the function doesn't use it.
Usually we keep the model objects as instance variables. That means that "vmhostdevs" should be declared in VMModel's constructor - along with many other model variables - as "self.vmhostdevs". Due to the reason in code that VMHostDevsModel can't be imported at the top of the class, the workaround is import and get the instance here.
Err, you're right, we have a huge dependency problem in the code right now... So you should keep that.
As I commented in the previous e-mail thread, the function "add_task" will add the task to the object store. You don't need to do that.
Will need this to verify whether the vm is already in migration, right ?
You definitely need that task later to track the migration process but the function "add_task" already stores the task in the object store. I mean you shouldn't store it yourself because it's already done. In order to check the migration task, you can lookup a task with the code: task = taskmodel.lookup(taskid); (where "taskmodel" is an instance of "TaskModel" and "taskid" is the Task ID) if you have the Task ID, or: alltasks = taskmodel.get_list() if you don't have the ID and you want to inspect all tasks (probably filtering by its 'target_uri' to find out which of them is related to migration).
participants (3)
-
Crístian Viana
-
simonjin
-
simonjin@linux.vnet.ibm.com