Hello,
ovirt-release-host-node-4.5.4-1.el8.x86_64
Today I found my cluster in an inconsistent state.
I have three nodes (ovirt-node2, ovirt-node3, ovirt-node4) with a self-hosted engine deployed
on external NFS storage.
My first step was to run hosted-engine --vm-status on all three nodes, and I got three
inconsistent states:
[root@ovirt-node2 ~]# hosted-engine --vm-status
The hosted engine configuration has not been retrieved from shared storage yet,
please ensure that ovirt-ha-agent service is running.
--== Host ovirt-node3.ovirt (id: 1) status ==--
Host ID : 1
Host timestamp : 1942858
Score : 3400
Engine status : unknown stale-data
Hostname : ovirt-node3.ovirt
Local maintenance : False
stopped : False
crc32 : 37cf5256
conf_on_shared_storage : True
local_conf_timestamp : 1942859
Status up-to-date : False
Extra metadata (valid at timestamp):
metadata_parse_version=1
metadata_feature_version=1
timestamp=1942858 (Sun Mar 12 01:26:20 2023)
host-id=1
score=3400
vm_conf_refresh_time=1942859 (Sun Mar 12 01:26:22 2023)
conf_on_shared_storage=True
maintenance=False
state=EngineDown
stopped=False
--== Host ovirt-node2.ovirt (id: 2) status ==--
Host ID : 2
Host timestamp : 4425500
Score : 3400
Engine status : unknown stale-data
Hostname : ovirt-node2.ovirt
Local maintenance : False
stopped : False
crc32 : ab944a8a
conf_on_shared_storage : True
local_conf_timestamp : 4425500
Status up-to-date : False
Extra metadata (valid at timestamp):
metadata_parse_version=1
metadata_feature_version=1
timestamp=4425500 (Sun Mar 12 01:26:01 2023)
host-id=2
score=3400
vm_conf_refresh_time=4425500 (Sun Mar 12 01:26:01 2023)
conf_on_shared_storage=True
maintenance=False
state=EngineUp
stopped=False
[root@ovirt-node3 ~]# hosted-engine --vm-status
--== Host ovirt-node4.ovirt (id: 3) status ==--
Host ID : 3
Host timestamp : 4452814
Score : 3400
Engine status : unknown stale-data
Hostname : ovirt-node4.ovirt
Local maintenance : False
stopped : False
crc32 : 95890d21
conf_on_shared_storage : True
local_conf_timestamp : 4452814
Status up-to-date : False
Extra metadata (valid at timestamp):
metadata_parse_version=1
metadata_feature_version=1
timestamp=4452814 (Sun Mar 12 01:25:55 2023)
host-id=3
score=3400
vm_conf_refresh_time=4452814 (Sun Mar 12 01:25:55 2023)
conf_on_shared_storage=True
maintenance=False
state=EngineDown
stopped=False
[root@ovirt-node4 ~]# hosted-engine --vm-status
--== Host ovirt-node3.ovirt (id: 1) status ==--
Host ID : 1
Host timestamp : 1942848
Score : 3400
Engine status : unknown stale-data
Hostname : ovirt-node3.ovirt
Local maintenance : False
stopped : False
crc32 : 7f645fbc
conf_on_shared_storage : True
local_conf_timestamp : 1942848
Status up-to-date : False
Extra metadata (valid at timestamp):
metadata_parse_version=1
metadata_feature_version=1
timestamp=1942848 (Sun Mar 12 01:26:10 2023)
host-id=1
score=3400
vm_conf_refresh_time=1942848 (Sun Mar 12 01:26:10 2023)
conf_on_shared_storage=True
maintenance=False
state=EngineDown
stopped=False
--== Host ovirt-node2.ovirt (id: 2) status ==--
Host ID : 2
Host timestamp : 4428404
Score : 3400
Engine status : unknown stale-data
Hostname : ovirt-node2.ovirt
Local maintenance : False
stopped : False
crc32 : af938ff8
conf_on_shared_storage : True
local_conf_timestamp : 4428404
Status up-to-date : False
Extra metadata (valid at timestamp):
metadata_parse_version=1
metadata_feature_version=1
timestamp=4428404 (Sun Mar 12 02:14:45 2023)
host-id=2
score=3400
vm_conf_refresh_time=4428404 (Sun Mar 12 02:14:45 2023)
conf_on_shared_storage=True
maintenance=False
state=EngineUp
stopped=False
--== Host ovirt-node4.ovirt (id: 3) status ==--
Host ID : 3
Host timestamp : 4470173
Score : 3400
Engine status : unknown stale-data
Hostname : ovirt-node4.ovirt
Local maintenance : False
stopped : False
crc32 : d8fdb650
conf_on_shared_storage : True
local_conf_timestamp : 4470173
Status up-to-date : False
Extra metadata (valid at timestamp):
metadata_parse_version=1
metadata_feature_version=1
timestamp=4470173 (Sun Mar 12 06:15:15 2023)
host-id=3
score=3400
vm_conf_refresh_time=4470173 (Sun Mar 12 06:15:15 2023)
conf_on_shared_storage=True
maintenance=False
state=EngineStarting
stopped=False
Obviously there is something weird happening.
I have now put my cluster into global maintenance mode, but I had to run hosted-engine
--set-maintenance --mode=global on both node3 and node4.
Please give me some hints. Over the weekend I received hundreds of emails saying that the
hosted engine had gone into an inconsistent state.
Show replies by date
In the end, it seems the problem was in the external NFS server: rpc.gssd failed and
the NFS service became unresponsive, so the hosted-engine configuration domain
wasn't reachable.