<html>
<head>
<meta content="text/html; charset=ISO-8859-1"
http-equiv="Content-Type">
</head>
<body bgcolor="#FFFFFF" text="#000000">
<div class="moz-cite-prefix"><br>
Hi,<br>
I reproduced this issue, and I believe it's a Python bug.<br>
1. How to reproduce:<br>
With the test case attached, put it under
/usr/share/vdsm/tests/ and run #./run_tests.sh superVdsmTests.py;
this issue will be reproduced.<br>
2. Log analysis:<br>
We notice a strange pattern in this log: connectStorageServer
is called twice; the first supervdsm call succeeds, the second fails
because of validateAccess().<br>
That is because the first validateAccess() call returns
normally and leaves a child process behind. When the second
validateAccess() call arrives and the multiprocessing manager is
receiving the method message, that is exactly the moment the first
child exits and SIGCHLD arrives. The signal interrupts the
multiprocessing recv() system call. Python's managers.py should
handle EINTR and retry recv(), as we do in vdsm, but it does not,
so the second call raises an error.<br>
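As a minimal, standalone illustration of the race described above
(this is not vdsm code; the names and the sleep are made up for the
demo), a SIGCHLD handler installed in the parent interrupts a blocking
multiprocessing recv() with EINTR when a child exits while the parent
is waiting:<br>
<pre>
# illustration only -- reproduces the interrupted recv() outside vdsm
import os
import signal
import time
import multiprocessing

def reaper(signum, frame):
    # reap exited children, like a zombie reaper handler would
    os.waitpid(-1, os.WNOHANG)

# Python installs the handler so that slow syscalls are interrupted
signal.signal(signal.SIGCHLD, reaper)

parent_end, child_end = multiprocessing.Pipe()

pid = os.fork()
if pid == 0:
    time.sleep(1)        # let the parent block in recv() first
    os._exit(0)          # child exits, SIGCHLD is delivered to the parent

try:
    parent_end.recv()    # blocks; the signal interrupts the underlying read
except IOError, e:
    print 'recv() interrupted:', e   # [Errno 4] Interrupted system call
</pre>
The vdsm log and client-side traceback:<br>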
<pre wrap="">>Thread-18::DEBUG::2013-01-22 10:41:03,570::misc::85::Storage.Misc.excCmd::(<lambda>) '/usr/bin/sudo -n /bin/mount -t nfs -o soft,nosharecache,timeo=600,retrans=6,nfsvers=3 192.168.0.1:/ovirt/silvermoon /rhev/data-center/mnt/192.168.0.1:_ovirt_silvermoon' (cwd None)
>Thread-18::DEBUG::2013-01-22 10:41:03,607::misc::85::Storage.Misc.excCmd::(<lambda>) '/usr/bin/sudo -n /bin/mount -t nfs -o soft,nosharecache,timeo=600,retrans=6,nfsvers=3 192.168.0.1:/ovirt/undercity /rhev/data-center/mnt/192.168.0.1:_ovirt_undercity' (cwd None)
>Thread-18::ERROR::2013-01-22 10:41:03,627::hsm::2215::Storage.HSM::(connectStorageServer) Could not connect to storageServer
>Traceback (most recent call last):
> File "/usr/share/vdsm/storage/hsm.py", line 2211, in connectStorageServer
> conObj.connect()
> File "/usr/share/vdsm/storage/storageServer.py", line 303, in connect
> return self._mountCon.connect()
> File "/usr/share/vdsm/storage/storageServer.py", line 209, in connect
> fileSD.validateDirAccess(self.getMountObj().getRecord().fs_file)
> File "/usr/share/vdsm/storage/fileSD.py", line 55, in validateDirAccess
> (os.R_OK | os.X_OK))
> File "/usr/share/vdsm/supervdsm.py", line 81, in __call__
> return callMethod()
> File "/usr/share/vdsm/supervdsm.py", line 72, in <lambda>
> **kwargs)
> File "<string>", line 2, in validateAccess
> File "/usr/lib64/python2.6/multiprocessing/managers.py", line 740, in _callmethod
> raise convert_to_error(kind, result)
</pre>
The vdsm side receives a RemoteError because the supervdsm server's
multiprocessing manager replied with an error of kind '#TRACEBACK':<br>
<pre> >RemoteError:
</pre>
The part above is the traceback from the client side; the
following part is from the server side:<br>
<pre>>---------------------------------------------------------------------------
>Traceback (most recent call last):
> File "/usr/lib64/python2.6/multiprocessing/managers.py", line 214, in serve_client
> request = recv()
>IOError: [Errno 4] Interrupted system call
>---------------------------------------------------------------------------
</pre>
Corresponding Python source code: managers.py (server side):<br>
<pre>    def serve_client(self, conn):
        '''
        Handle requests from the proxies in a particular process/thread
        '''
        util.debug('starting server thread to service %r',
                   threading.current_thread().name)

        recv = conn.recv
        send = conn.send
        id_to_obj = self.id_to_obj

        while not self.stop:

            try:
                methodname = obj = None
                request = recv()<------------------<font color="#3366ff">this line is interrupted by SIGCHLD</font>
                ident, methodname, args, kwds = request
                obj, exposed, gettypeid = id_to_obj[ident]

                if methodname not in exposed:
                    raise AttributeError(
                        'method %r of %r object is not in exposed=%r' %
                        (methodname, type(obj), exposed)
                        )

                function = getattr(obj, methodname)

                try:
                    res = function(*args, **kwds)
                except Exception, e:
                    msg = ('#ERROR', e)
                else:
                    typeid = gettypeid and gettypeid.get(methodname, None)
                    if typeid:
                        rident, rexposed = self.create(conn, typeid, res)
                        token = Token(typeid, self.address, rident)
                        msg = ('#PROXY', (rexposed, token))
                    else:
                        msg = ('#RETURN', res)

            except AttributeError:
                if methodname is None:
                    msg = ('#TRACEBACK', format_exc())
                else:
                    try:
                        fallback_func = self.fallback_mapping[methodname]
                        result = fallback_func(
                            self, conn, ident, obj, *args, **kwds
                            )
                        msg = ('#RETURN', result)
                    except Exception:
                        msg = ('#TRACEBACK', format_exc())

            except EOFError:
                util.debug('got EOF -- exiting thread serving %r',
                           threading.current_thread().name)
                sys.exit(0)

            except Exception:<------<font color="#3366ff">IOError/EINTR is not handled here; it should retry recv()</font>
                msg = ('#TRACEBACK', format_exc())
</pre>
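To make the missing handling concrete, here is a minimal sketch (not an
actual upstream patch) of how the interrupted recv() could be retried
instead of being turned into a '#TRACEBACK' reply:<br>
<pre>
# sketch only -- retry recv() when it is interrupted by a signal
import errno

def recv_retrying(recv):
    # Loop until recv() completes without being interrupted (EINTR).
    while True:
        try:
            return recv()
        except IOError, e:
            if e.errno != errno.EINTR:
                raise
            # EINTR: recv() was interrupted by SIGCHLD, so retry it

# in serve_client(), "request = recv()" would then become:
#     request = recv_retrying(recv)
</pre>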
<br>
<br>
3. Actions we will take:<br>
(1) As a workaround, we can first remove the zombie reaper from
the supervdsm server (an illustrative sketch of what that handler
amounts to follows below).<br>
(2) I'll check whether Python has a fixed version for this.<br>
(3) Yaniv is working on changing the vdsm/svdsm communication channel
to a pipe that we handle ourselves; I believe we'll get rid of this
issue once that is in place.<br>
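For context on workaround (1), a SIGCHLD-based zombie reaper amounts to
roughly the following (illustrative only; the actual vdsm zombiereaper
module differs in detail). Not registering such a handler in the
supervdsm server process leaves nothing to interrupt the manager's
blocking recv(), at the cost of having to wait on exited children
explicitly:<br>
<pre>
# illustration only -- not the actual vdsm zombiereaper module
import os
import signal

def _reap_children(signum, frame):
    # collect any children that have exited so they do not stay zombies
    try:
        while True:
            pid, status = os.waitpid(-1, os.WNOHANG)
            if pid == 0:
                break           # children exist but none has exited yet
    except OSError:
        pass                    # no child processes left

signal.signal(signal.SIGCHLD, _reap_children)
</pre>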
<br>
<br>
On 01/25/2013 06:00 AM, Dead Horse wrote:<br>
</div>
<blockquote
cite="mid:CAEWPe=qjozgWjK3U7-PDky8kToy8=R9A=ES32hQj1qmhhczTjA@mail.gmail.com"
type="cite">
<div dir="ltr">
<div>Tried some manual edits to SD states in the dbase. The net
result was I was able to get a node active. However, as
reconstructing the master storage domain kicked in, it was
unable to do so. It was also not able to recognize the other
SDs, with similar failure modes to the unrecognized master
above. Guessing the newer VDSM version borked things pretty
good. So, being as this is a test harness and the SD data is
not worth saving, I just smoked all the SDs. I ran
engine-cleanup and started fresh, and all is well now.<br>
<br>
</div>
- DHC<br>
</div>
<div class="gmail_extra"><br>
<br>
<div class="gmail_quote">On Thu, Jan 24, 2013 at 11:53 AM, Dead
Horse <span dir="ltr"><<a moz-do-not-send="true"
href="mailto:deadhorseconsulting@gmail.com"
target="_blank">deadhorseconsulting@gmail.com</a>></span>
wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0
.8ex;border-left:1px #ccc solid;padding-left:1ex">
<div dir="ltr">
<div>
<div>
<div>
<div>This test harness setup consists of two
servers tied to NFS storage via IB (NFS mounts are
via IPoIB; NFS over RDMA is disabled). All
storage domains are NFS. The issue does occur with
both servers when attempting to bring them out
of maintenance mode, with the end result being
non-operational due to storage attach failure.<br>
<br>
</div>
The current issue is that with a working older
commit the master storage domain is "stuck" in state
"locked", and I see the secondary issue wherein VDSM
cannot seem to find or contact the master storage
domain even though it is there. I can mount the
master storage domain manually, and all content
appears to be accounted for on either
host. <br>
<br>
</div>
Here are the current contents of the master storage
domain metadata:<br>
CLASS=Data<br>
DESCRIPTION=orgrimmar<br>
IOOPTIMEOUTSEC=1<br>
LEASERETRIES=3<br>
LEASETIMESEC=5<br>
LOCKPOLICY=<br>
LOCKRENEWALINTERVALSEC=5<br>
MASTER_VERSION=417<br>
POOL_DESCRIPTION=Azeroth<br>
POOL_DOMAINS=0549ee91-4498-4130-8c23-4c173b5c0959:Active,d8b55105-c90a-465d-9803-8130da9a671e:Active,67534cca-1327-462a-b455-a04464084b31:Active,c331a800-839d-4d23-9059-870a7471240a:Active,f8984825-ff8d-43d9-91db-0d0959f8bae9:Active,c434056e-96be-4702-8beb-82a408a5c8cb:Active,f7da73c7-b5fe-48b6-93a0-0c773018c94f:Active,82e3b34a-6f89-4299-8cd8-2cc8f973a3b4:Active,e615c975-6b00-469f-8fb6-ff58ae3fdb2c:Active,5bc86532-55f7-4a91-a52c-fad261f322d5:Active,1130b87a-3b34-45d6-8016-d435825c68ef:Active<br>
POOL_SPM_ID=1<br>
POOL_SPM_LVER=6<br>
POOL_UUID=f90a0d1c-06ca-11e2-a05b-00151712f280<br>
REMOTE_PATH=192.168.0.1:/ovirt/orgrimmar<br>
ROLE=Master<br>
SDUUID=67534cca-1327-462a-b455-a04464084b31<br>
TYPE=NFS<br>
VERSION=3<br>
_SHA_CKSUM=1442bb078fd8c9468d241ff141e9bf53839f0721<br>
<br>
So now, with the older working commit, I get
the "StoragePoolMasterNotFound: Cannot find master
domain" error (prior details above from when I worked
backwards to that commit).<br>
<br>
This is odd as the nodes can definitely reach the
master storage domain:<br>
<br>
</div>
showmount from one of the el6.3 nodes:<br>
[root@kezan ~]# showmount -e 192.168.0.1<br>
Export list for <a moz-do-not-send="true"
href="http://192.168.0.1" target="_blank">192.168.0.1</a>:<br>
/ovirt/orgrimmar <a moz-do-not-send="true"
href="http://192.168.0.0/16" target="_blank">192.168.0.0/16</a><br>
<br>
</div>
mount/ls from one of the nodes:<br>
<div>
<div>[root@kezan ~]# mount 192.168.0.1:/ovirt/orgrimmar
/mnt<br>
[root@kezan ~]# ls -al
/mnt/67534cca-1327-462a-b455-a04464084b31/dom_md/<br>
total 1100<br>
drwxr-xr-x 2 vdsm kvm 4096 Jan 24 11:44 .<br>
drwxr-xr-x 5 vdsm kvm 4096 Oct 19 16:16 ..<br>
-rw-rw---- 1 vdsm kvm 1048576 Jan 19 22:09 ids<br>
-rw-rw---- 1 vdsm kvm 0 Sep 25 00:46 inbox<br>
-rw-rw---- 1 vdsm kvm 2097152 Jan 10 13:33 leases<br>
-rw-r--r-- 1 vdsm kvm 903 Jan 10 13:39 metadata<br>
-rw-rw---- 1 vdsm kvm 0 Sep 25 00:46 outbox<br>
<br>
<br>
</div>
<div>- DHC<br>
</div>
<div><br>
</div>
</div>
</div>
<div class="HOEnZb">
<div class="h5">
<div class="gmail_extra"><br>
<br>
<div class="gmail_quote">On Thu, Jan 24, 2013 at 7:51
AM, ybronhei <span dir="ltr"><<a
moz-do-not-send="true"
href="mailto:ybronhei@redhat.com"
target="_blank">ybronhei@redhat.com</a>></span>
wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0
.8ex;border-left:1px #ccc solid;padding-left:1ex">
<div>On 01/24/2013 12:44 AM, Dead Horse wrote:<br>
</div>
<blockquote class="gmail_quote" style="margin:0 0
0 .8ex;border-left:1px #ccc
solid;padding-left:1ex">
<div>
I narrowed down on the commit where the
originally reported issue crept in:<br>
</div>
commit fc3a44f71d2ef202cff18d7203b9e4165b546621; building
and testing with
<div><br>
this commit or subsequent commits yields the
original issue.<br>
</div>
</blockquote>
Interesting.. it might be related to this commit,
and we're trying to reproduce it.<br>
<br>
Did you try to remove that code and run again?
Does it work without the addition of
zombieReaper?<br>
Does the connectivity to the storage work well?
When you run 'ls' on the mounted folder, do you
see the files without a long delay? It might be
related to too long a timeout when validating access
to this mount..<br>
We're working on that.. any additional info can help<br>
<br>
Thanks.
<div>
<div><br>
<blockquote class="gmail_quote"
style="margin:0 0 0 .8ex;border-left:1px
#ccc solid;padding-left:1ex">
<br>
- DHC<br>
<br>
<br>
On Wed, Jan 23, 2013 at 3:56 PM, Dead Horse<br>
<<a moz-do-not-send="true"
href="mailto:deadhorseconsulting@gmail.com"
target="_blank">deadhorseconsulting@gmail.com</a>>wrote:<br>
<br>
<blockquote class="gmail_quote"
style="margin:0 0 0 .8ex;border-left:1px
#ccc solid;padding-left:1ex">
Indeed, reverting back to an older vdsm
clears up the above issue. However,<br>
now the issue I see is:<br>
Thread-18::ERROR::2013-01-23<br>
15:50:42,885::task::833::TaskManager.Task::(_setError)<br>
Task=`08709e68-bcbc-40d8-843a-d69d4df40ac6`::Unexpected
error<br>
<br>
Traceback (most recent call last):<br>
File "/usr/share/vdsm/storage/task.py",
line 840, in _run<br>
return fn(*args, **kargs)<br>
File "/usr/share/vdsm/logUtils.py",
line 42, in wrapper<br>
res = f(*args, **kwargs)<br>
File "/usr/share/vdsm/storage/hsm.py",
line 923, in connectStoragePool<br>
masterVersion, options)<br>
File "/usr/share/vdsm/storage/hsm.py",
line 970, in _connectStoragePool<br>
res = pool.connect(hostID, scsiKey,
msdUUID, masterVersion)<br>
File "/usr/share/vdsm/storage/sp.py",
line 643, in connect<br>
self.__rebuild(msdUUID=msdUUID,
masterVersion=masterVersion)<br>
File "/usr/share/vdsm/storage/sp.py",
line 1167, in __rebuild<br>
self.masterDomain =
self.getMasterDomain(msdUUID=msdUUID,<br>
masterVersion=masterVersion)<br>
File "/usr/share/vdsm/storage/sp.py",
line 1506, in getMasterDomain<br>
raise se.StoragePoolMasterNotFound(self.spUUID,
msdUUID)<br>
StoragePoolMasterNotFound: Cannot find
master domain:<br>
'spUUID=f90a0d1c-06ca-11e2-a05b-00151712f280,<br>
msdUUID=67534cca-1327-462a-b455-a04464084b31'<br>
Thread-18::DEBUG::2013-01-23<br>
15:50:42,887::task::852::TaskManager.Task::(_run)<br>
Task=`08709e68-bcbc-40d8-843a-d69d4df40ac6`::Task._run:<br>
08709e68-bcbc-40d8-843a-d69d4df40ac6<br>
('f90a0d1c-06ca-11e2-a05b-00151712f280',
2,<br>
'f90a0d1c-06ca-11e2-a05b-00151712f280',<br>
'67534cca-1327-462a-b455-a04464084b31',
433) {} failed - stopping task<br>
<br>
This is with vdsm built from<br>
commit25a2d8572ad32352227c98a86631300fbd6523c1<br>
- DHC<br>
<br>
<br>
On Wed, Jan 23, 2013 at 10:44 AM, Dead
Horse <<br>
<a moz-do-not-send="true"
href="mailto:deadhorseconsulting@gmail.com"
target="_blank">deadhorseconsulting@gmail.com</a>>
wrote:<br>
<br>
<blockquote class="gmail_quote"
style="margin:0 0 0 .8ex;border-left:1px
#ccc solid;padding-left:1ex">
VDSM was built from:<br>
commit 166138e37e75767b32227746bb671b1dab9cdd5e<br>
<br>
Attached is the full vdsm log<br>
<br>
I should also note that from engine
perspective it sees the master<br>
storage domain as locked and the others
as unknown.<br>
<br>
<br>
On Wed, Jan 23, 2013 at 2:49 AM, Dan
Kenigsberg <<a moz-do-not-send="true"
href="mailto:danken@redhat.com"
target="_blank">danken@redhat.com</a>>wrote:<br>
<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
On Tue, Jan 22, 2013 at 04:02:24PM
-0600, Dead Horse wrote:<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
Any ideas on this one? (from VDSM
log):<br>
Thread-25::DEBUG::2013-01-22<br>
15:35:29,065::BindingXMLRPC::914::vds::(wrapper)
client<br>
</blockquote>
[3.57.111.30]::call<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
getCapabilities with () {}<br>
Thread-25::ERROR::2013-01-22
15:35:29,113::netinfo::159::root::(speed)<br>
cannot read ib0 speed<br>
Traceback (most recent call last):<br>
File "/usr/lib64/python2.6/site-packages/vdsm/netinfo.py",
line 155,<br>
</blockquote>
in<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
speed<br>
s =
int(file('/sys/class/net/%s/speed' %
dev).read())<br>
IOError: [Errno 22] Invalid argument<br>
<br>
Causes VDSM to fail to attach
storage<br>
</blockquote>
<br>
I doubt that this is the cause of the
failure, as vdsm has always<br>
reported "0" for ib devices, and still
is.<br>
</blockquote>
</blockquote>
</blockquote>
</blockquote>
</div>
</div>
It happens only when you call getCapabilities..
so it isn't related to the flow, and it can't
affect the storage.<br>
Dan: I guess this is not the issue, but why the
IOError?<br>
<blockquote class="gmail_quote" style="margin:0 0
0 .8ex;border-left:1px #ccc
solid;padding-left:1ex">
<div>
<div>
<blockquote class="gmail_quote"
style="margin:0 0 0 .8ex;border-left:1px
#ccc solid;padding-left:1ex">
<blockquote class="gmail_quote"
style="margin:0 0 0 .8ex;border-left:1px
#ccc solid;padding-left:1ex">
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
<br>
Does a former version work with your
Engine?<br>
Could you share more of your vdsm.log?
I suppose the culprit lies in<br>
one of the storage-related commands,
not in statistics retrieval.<br>
<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
<br>
Engine side sees:<br>
ERROR [org.ovirt.engine.core.bll.storage.NFSStorageHelper]<br>
(QuartzScheduler_Worker-96)
[553ef26e] The connection with
details<br>
192.168.0.1:/ovirt/ds failed because
of error code 100 and error<br>
</blockquote>
message<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
is: general exception<br>
2013-01-22 15:35:30,160 INFO<br>
[org.ovirt.engine.core.bll.SetNonOperationalVdsCommand]<br>
(QuartzScheduler_Worker-96)
[1ab78378] Running command:<br>
SetNonOperationalVdsCommand
internal: true. Entities affected :
ID:<br>
8970b3fe-1faf-11e2-bc1f-00151712f280
Type: VDS<br>
2013-01-22 15:35:30,200 INFO<br>
[org.ovirt.engine.core.vdsbroker.SetVdsStatusVDSCommand]<br>
(QuartzScheduler_Worker-96)
[1ab78378] START,<br>
SetVdsStatusVDSCommand(HostName =
kezan, HostId =<br>
8970b3fe-1faf-11e2-bc1f-00151712f280,
status=NonOperational,<br>
nonOperationalReason=STORAGE_DOMAIN_UNREACHABLE),
log id: 4af5c4cd<br>
2013-01-22 15:35:30,211 INFO<br>
[org.ovirt.engine.core.vdsbroker.SetVdsStatusVDSCommand]<br>
(QuartzScheduler_Worker-96)
[1ab78378] FINISH,
SetVdsStatusVDSCommand,<br>
</blockquote>
log<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
id: 4af5c4cd<br>
2013-01-22 15:35:30,242 ERROR<br>
[org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector]<br>
(QuartzScheduler_Worker-96)
[1ab78378] Try to add duplicate
audit log<br>
values with the same name. Type:
VDS_SET_NONOPERATIONAL_DOMAIN.
Value:<br>
storagepoolname<br>
<br>
Engine = latest master<br>
VDSM = latest master<br>
</blockquote>
<br>
Since "latest master" is an unstable
reference by definition, I'm sure<br>
that History would thank you if you
post the exact version (git hash?)<br>
of the code.<br>
<br>
<blockquote class="gmail_quote"
style="margin:0 0 0
.8ex;border-left:1px #ccc
solid;padding-left:1ex">
node = el6<br>
</blockquote>
<br>
<br>
</blockquote>
<br>
</blockquote>
<br>
</blockquote>
<br>
<br>
<br>
</div>
</div>
_______________________________________________<br>
Users mailing list<br>
<a moz-do-not-send="true"
href="mailto:Users@ovirt.org" target="_blank">Users@ovirt.org</a><br>
<a moz-do-not-send="true"
href="http://lists.ovirt.org/mailman/listinfo/users"
target="_blank">http://lists.ovirt.org/mailman/listinfo/users</a><br>
<br>
<span><font color="#888888">
</font></span></blockquote>
<span><font color="#888888">
<br>
<br>
-- <br>
Yaniv Bronhaim.<br>
RedHat, Israel<br>
09-7692289<br>
054-7744187<br>
</font></span></blockquote>
</div>
<br>
</div>
</div>
</div>
</blockquote>
</div>
<br>
</div>
</blockquote>
<br>
<br>
</body>
</html>