Date: Sun, 24 Jul 2011 23:02:24 +0000 (UTC)
From: Mark Linimon <linimon@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r224298 - projects/portbuild/scripts
Message-ID: <201107242302.p6ON2OJQ027876@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: linimon (doc,ports committer) Date: Sun Jul 24 23:02:24 2011 New Revision: 224298 URL: http://svn.freebsd.org/changeset/base/224298 Log: Add a great deal of error handling. A specific case that causes pollmachine to go catatonic seems to be the "disk full" condition. Modified: projects/portbuild/scripts/pollmachine Modified: projects/portbuild/scripts/pollmachine ============================================================================== --- projects/portbuild/scripts/pollmachine Sun Jul 24 20:09:42 2011 (r224297) +++ projects/portbuild/scripts/pollmachine Sun Jul 24 23:02:24 2011 (r224298) @@ -2,7 +2,7 @@ # # pollmachine # -# Monitors build machines and notifies qmgr of changes +# Monitors build machines and notifies qmanager of changes # # pollmachine [options] [arch] ... @@ -16,7 +16,7 @@ # # TODO: -# XXX qmgr notification of new/removed machines +# XXX qmanager notification of new/removed machines # XXX counter before declaring a machine as dead # Declares a machine as online if it reports 0 data from infoseek? 
@@ -28,6 +28,10 @@ import sys, threading, socket from time import sleep import os, subprocess, logging +EXPECTED_LINES = 6 + +DEBUG=False + pbc = os.getenv('PORTBUILD_CHECKOUT') \ if os.getenv('PORTBUILD_CHECKOUT') else "/var/portbuild" pbd = os.getenv('PORTBUILD_DATA') \ @@ -122,25 +126,42 @@ class MachinePoll(threading.Thread): try: s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(60) - s.connect((self.host, self.port)) + retval = s.connect_ex((self.host, self.port)) + if retval != 0: + if self.online: + logging.info("[%s] Connection error: %s" % (self.mach, `retval`)) + self.timeouts += 1 + else: + if DEBUG: + logging.info("%s connected to socket for %s" % ( str(self), self.mach )) - data = "" - while len(data) < 65536: - chunk = s.recv(8192) - if not chunk: - break - data += chunk - - nowonline = True - self.timeouts = 0 - lines = data.split("\n") + data = "" + while len(data) < 65536: + chunk = s.recv(8192) + if not chunk: + break + data += chunk + + if DEBUG: + logging.info("%s: len(data) = %d" % (self.mach, len(data))) + if len(data) > 0: + lines = data.split("\n") + if len(lines) >= EXPECTED_LINES: + nowonline = True + self.timeouts = 0 + else: + # XXX MCL + if DEBUG or True: + logging.info("%s: truncated reply: %s" % (self.mach, lines)) except socket.timeout: if self.online: logging.info("[%s] Connection timeout" % self.mach) self.timeouts += 1 if self.timeouts < 3: nowonline = self.online - except: + except Exception, e: + print "pollmachine: exception in poll for %s:" %self.mach + print e pass finally: try: @@ -153,7 +174,7 @@ class MachinePoll(threading.Thread): self.online = nowonline if self.online: self.timeouts = 0 - # XXX inform qmgr of state change + # XXX inform qmanager of state change if self.online and not lines and not self.timeouts: # reportload script is missing @@ -180,7 +201,7 @@ class MachinePoll(threading.Thread): if old != part[2]: self.vars[part[0]] = part[2] # logging.info("%s@%s: \"%s\" -> \"%s\"" % (part[0], 
self.mach, old, part[2])) - # XXX update qmgr + # XXX inform qmanager try: envs = self.vars['buildenvs'] @@ -222,16 +243,31 @@ class MachinePoll(threading.Thread): except KeyError: pass + if DEBUG: + logging.info("%s recording current system load for %s" % ( str(self), self.mach )) # Record current system load + # note: can fail on "file system full" try: f = file("%s/%s/loads/%s" % (pbd, self.arch, self.mach), "w") - except: + except Exception, e: + print "pollmachine: exception in creating %s/%s/loads/%s:" % (pbd, self.arch, self.mach) + print e return try: - f.write("%s %s\n" % (self.vars['jobs'], self.vars['load'])) - except: - pass - f.close() + if 'jobs' in self.vars and 'load' in self.vars: + f.write("%s %s\n" % (self.vars['jobs'], self.vars['load'])) + else: + # machine is not responding to poll. + # XXX MCL remove from machines + # XXX inform qmanager + f.write("") + f.close() + except Exception, e: + print "pollmachine: exception in writing %s/%s/loads/%s:" % (pbd, self.arch, self.mach) + print self.vars + print e + if DEBUG: + logging.info("%s finished polling for %s" % ( str(self), self.mach )) def setup(self, branch, buildid, args = ""): cmd = "su ports-%s -c \"%s/scripts/dosetupnode %s %s %s %s %s\""\ @@ -275,13 +311,13 @@ while True: for mach in gone: logging.info("Removing machine %s/%s" % (arch, mach)) - # XXX disable from qmgr + # XXX disable from qmanager pollthreads[mach].shutdown=True del pollthreads[mach] for mach in new: logging.info("Adding machine %s/%s" % (arch, mach)) - # XXX set up qmgr + # XXX set up qmanager pc="%s/%s/portbuild.conf" % (pbd, arch) pch="%s/%s/portbuild.%s" % (pbd, arch, mach) @@ -303,4 +339,10 @@ while True: if not polldelay: break + if DEBUG: + logging.info("Ready to sleep") sleep(polldelay) + if DEBUG: + logging.info("Wakeup") + +logging.info("pollmachine: exiting.")
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201107242302.p6ON2OJQ027876>