[
https://ovirt-jira.atlassian.net/browse/OVIRT-938?page=com.atlassian.jira...
]
Barak Korren commented on OVIRT-938:
------------------------------------
Trying to get more info about this, here are the journal lines we get on the slave when
the job fails:
{code}
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org libvirtd[17504]: End of file while reading
data: Input/output error
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_END pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_LOGOUT pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1106
audit(1482333814.930:3694): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1113
audit(1482333814.930:3695): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1112
audit(1482333814.952:3696): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1105
audit(1482333814.952:3697): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_LOGIN pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_START pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy
kind=server
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
direction=? spid=25269 suid=1012 exe="/usr/sbin/sshd" hostname=?
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=2404
audit(1482333814.953:3698): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy kind=server
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
direction=? spid=25269 suid=1012 exe="/usr/sbin/sshd" hostname=?
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org sshd[5421]: Received disconnect from
66.187.230.92 port 53024:11: Closed due to user request.
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=2404
audit(1482333814.989:3699): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy kind=server
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
direction=? spid=5421 suid=1012 exe="/usr/sbin/sshd" hostname=?
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=2404
audit(1482333814.989:3700): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy kind=session fp=?
direction=both spid=5421 suid=1012 rport=53024 laddr=172.19.12.136 lport=22
exe="/usr/sbin/sshd" hostname=? addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy
kind=server
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
direction=? spid=5421 suid=1012 exe="/usr/sbin/sshd" hostname=?
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy
kind=session fp=? direction=both spid=5421 suid=1012 rport=53024 laddr=172.19.12.136
lport=22 exe="/usr/sbin/sshd" hostname=? addr=66.187.230.92 terminal=?
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org sshd[5421]: Disconnected from 66.187.230.92
port 53024
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1106
audit(1482333814.995:3701): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=PAM:session_close
grantors=pam_selinux,pam_loginuid,pam_selinux,pam_namespace,pam_keyinit,pam_keyinit,pam_limits,pam_systemd,pam_unix,pam_lastlog
acct="jenkins" exe="/usr/sbin/sshd" hostname=66.187.230.92
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1104
audit(1482333814.995:3702): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=PAM:setcred
grantors=pam_env,pam_unix acct="jenkins" exe="/usr/sbin/sshd"
hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org kernel: audit: type=1106
audit(1482333814.996:3703): pid=5410 uid=0 auid=1012 ses=3
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_END pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=PAM:session_close
grantors=pam_selinux,pam_loginuid,pam_selinux,pam_namespace,pam_keyinit,pam_keyinit,pam_limits,pam_systemd,pam_unix,pam_lastlog
acct="jenkins" exe="/usr/sbin/sshd" hostname=66.187.230.92
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRED_DISP pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=PAM:setcred
grantors=pam_env,pam_unix acct="jenkins" exe="/usr/sbin/sshd"
hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_END pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: USER_LOGOUT pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh
res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy
kind=server
fp=SHA256:c1:30:1f:20:35:5f:1a:4a:cc:9f:c9:eb:05:0c:0f:41:06:32:52:d7:d3:e9:fc:0a:aa:ae:f4:ce:98:15:b3:4c
direction=? spid=5410 suid=0 exe="/usr/sbin/sshd" hostname=? addr=66.187.230.92
terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy
kind=server
fp=SHA256:3d:5f:aa:8f:5f:6d:01:ba:8a:6c:ca:2a:ae:4e:78:f6:19:cd:44:7c:df:6a:4d:58:ef:be:16:c0:41:b9:a8:a4
direction=? spid=5410 suid=0 exe="/usr/sbin/sshd" hostname=? addr=66.187.230.92
terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER pid=5410 uid=0
auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy
kind=server
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
direction=? spid=5410 suid=0 exe="/usr/sbin/sshd" hostname=? addr=66.187.230.92
terminal=? res=success'
Dec 21 15:23:34
vm0136.workers-phx.ovirt.org sshd[5410]: pam_unix(sshd:session): session
closed for user jenkins
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd-logind[444]: Removed session 3.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[1]: Stopping User Manager for UID
1012...
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target Default.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target Basic System.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target Timers.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target Paths.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target Sockets.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Closed D-Bus User Message Bus
Socket.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Reached target Shutdown.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Starting Exit the Session...
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[5414]: Received SIGRTMIN+24 from PID
25273 (kill).
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[1]: Stopped User Manager for UID
1012.
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org audit[1]: SERVICE_STOP pid=1 uid=0
auid=4294967295 ses=4294967295 subj=system_u:system_r:init_t:s0 msg='unit=user@1012
comm="systemd" exe="/usr/lib/systemd/systemd" hostname=? addr=?
terminal=? res=success'
Dec 21 15:23:35
vm0136.workers-phx.ovirt.org systemd[1]: Removed slice User Slice of
jenkins.
{code}
Fix Jenkins slave connection dying on vdsm check_merged jobs
------------------------------------------------------------
Key: OVIRT-938
URL:
https://ovirt-jira.atlassian.net/browse/OVIRT-938
Project: oVirt - virtualization made easy
Issue Type: Bug
Components: Jenkins
Reporter: Barak Korren
Assignee: infra
Something in the vdsm build_artifacs job makes the Jenkins slave disconnect when it is
running. This in turn makes the cleanup scripts not run on the slave leaving it dirty
enough to make the next job on that slave fail.
Example of this can be seen here:
http://jenkins.ovirt.org/job/vdsm_master_check-merged-el7-x86_64/692/console
Relevant log lines:
{code}
21:49:00 Ran 44 tests in 1231.988s
21:49:00
21:49:00 OK
21:49:00 + return 0
21:49:00 sh: [13086: 1 (255)] tcsetattr: Inappropriate ioctl for device
21:49:00 Took 2464 seconds
21:49:00 ===================================
21:49:00 logout
21:49:01 Slave went offline during the build
21:49:01 ERROR: Connection was broken: java.io.IOException: Unexpected termination of the
channel
21:49:01 at
hudson.remoting.SynchronousCommandTransport$ReaderThread.run(SynchronousCommandTransport.java:50)
21:49:01 Caused by: java.io.EOFException
21:49:01 at
java.io.ObjectInputStream$PeekInputStream.readFully(ObjectInputStream.java:2353)
21:49:01 at
java.io.ObjectInputStream$BlockDataInputStream.readShort(ObjectInputStream.java:2822)
21:49:01 at java.io.ObjectInputStream.readStreamHeader(ObjectInputStream.java:804)
21:49:01 at java.io.ObjectInputStream.<init>(ObjectInputStream.java:301)
21:49:01 at
hudson.remoting.ObjectInputStreamEx.<init>(ObjectInputStreamEx.java:48)
21:49:01 at
hudson.remoting.AbstractSynchronousByteArrayCommandTransport.read(AbstractSynchronousByteArrayCommandTransport.java:34)
21:49:01 at
hudson.remoting.SynchronousCommandTransport$ReaderThread.run(SynchronousCommandTransport.java:48)
21:49:01
21:49:01 Build step 'Execute shell' marked build as failure
21:49:01 Performing Post build task...
21:49:01 Match found for :.* : True
21:49:01 Logical operation result is TRUE
21:49:01 Running script : #!/bin/bash -x
21:49:01 echo "shell-scripts/mock_cleanup.sh"
... SNIP ...
21:49:01 Exception when executing the batch command : no workspace from node
hudson.slaves.DumbSlave[fc24-vm06.phx.ovirt.org] which is computer
hudson.slaves.SlaveComputer@30863c81 and has channel null
21:49:01 Build step 'Post build task' marked build as failure
21:49:02 ERROR: Step ?Archive the artifacts? failed: no workspace for
vdsm_master_check-merged-el7-x86_64 #692
21:49:02 ERROR: Failed to evaluate groovy script.
21:49:02 java.lang.NullPointerException: Cannot invoke method child() on null object
21:49:02 at org.codehaus.groovy.runtime.NullObject.invokeMethod(NullObject.java:77)
21:49:02 at
org.codehaus.groovy.runtime.callsite.PogoMetaClassSite.call(PogoMetaClassSite.java:45)
21:49:02 at
org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:42)
21:49:02 at
org.codehaus.groovy.runtime.callsite.NullCallSite.call(NullCallSite.java:32)
21:49:02 at
org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:42)
21:49:02 at
org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:108)
21:49:02 at
org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:116)
21:49:02 at Script1.run(Script1.groovy:2)
21:49:02 at groovy.lang.GroovyShell.evaluate(GroovyShell.java:580)
21:49:02 at groovy.lang.GroovyShell.evaluate(GroovyShell.java:618)
21:49:02 at groovy.lang.GroovyShell.evaluate(GroovyShell.java:589)
21:49:02 at
org.jenkinsci.plugins.scriptsecurity.sandbox.groovy.SecureGroovyScript.evaluate(SecureGroovyScript.java:166)
21:49:02 at
org.jvnet.hudson.plugins.groovypostbuild.GroovyPostbuildRecorder.perform(GroovyPostbuildRecorder.java:361)
21:49:02 at hudson.tasks.BuildStepMonitor$1.perform(BuildStepMonitor.java:20)
21:49:02 at
hudson.model.AbstractBuild$AbstractBuildExecution.perform(AbstractBuild.java:782)
21:49:02 at
hudson.model.AbstractBuild$AbstractBuildExecution.performAllBuildSteps(AbstractBuild.java:723)
21:49:02 at hudson.model.Build$BuildExecution.post2(Build.java:185)
21:49:02 at
hudson.model.AbstractBuild$AbstractBuildExecution.post(AbstractBuild.java:668)
21:49:02 at hudson.model.Run.execute(Run.java:1763)
21:49:02 at hudson.model.FreeStyleBuild.run(FreeStyleBuild.java:43)
21:49:02 at hudson.model.ResourceController.execute(ResourceController.java:98)
21:49:02 at hudson.model.Executor.run(Executor.java:410)
21:49:02 Build step 'Groovy Postbuild' marked build as failure
21:49:02 Started calculate disk usage of build
21:49:02 Finished Calculation of disk usage of build in 0 seconds
21:49:02 Finished: FAILURE
{code}
--
This message was sent by Atlassian JIRA
(v1000.621.5#100023)