Submitter | Matt Harbison |
---|---|
Date | March 29, 2018, 2:45 a.m. |
Message ID | <7a6eccc3b43dd8f6090b.1522291530@Envy> |
Download | mbox | patch |
Permalink | /patch/29930/ |
State | Accepted |
Headers | show |
Comments
On Wed, 28 Mar 2018 22:45:30 -0400, Matt Harbison wrote: > # HG changeset patch > # User Matt Harbison <matt_harbison@yahoo.com> > # Date 1522210269 14400 > # Wed Mar 28 00:11:09 2018 -0400 > # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc > # Parent 8bac14ce57781d2a45b6d036b3614b1a21917f0d > server: add an error feedback mechanism for when the daemon fails to launch > > There's a recurring problem on Windows where `hg serve -d` will randomly fail to > spawn a detached process. The reason for the failure is completely hidden, and > it takes hours to get a single failure on my laptop. All this does is redirect > stdout/stderr of the child to a file until the lock file is freed, and then the > parent dumps it out if it fails to spawn. Abusing the lockfile is fine, but ... > diff --git a/mercurial/server.py b/mercurial/server.py > --- a/mercurial/server.py > +++ b/mercurial/server.py > @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init > runargs=None, appendpid=False): > '''Run a command as a service.''' > > + # When daemonized, redirect stdout/stderr to the lockfile (which gets > + # cleaned up after the child is up and running), so that the parent can read > + # and print the error if this child dies early. > + if opts['daemon_postexec']: > + for inst in opts['daemon_postexec']: > + if inst.startswith('unlink:'): > + lockpath = inst[7:] > + procutil.stdout.flush() > + procutil.stderr.flush() > + logfilefp = open(lockpath, 'w+') Nit: os.open(lockpath, os.O_WRONLY | os.O_APPEND | os.O_BINARY) is probably better as it enforces that the lock was created by the parent process. > + os.dup2(logfilefp.fileno(), 1) > + os.dup2(logfilefp.fileno(), 2) > + logfilefp.close() > @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init > try: > if not runargs: > runargs = procutil.hgcmd() + pycompat.sysargv[1:] > + # Only hg processes accept --traceback (e.g., not dumbhttp.py) > + runargs.append('--traceback') Doesn't it enable the traceback permanently? The --traceback option should be specified explicitly only when needed. > runargs.append('--daemon-postexec=unlink:%s' % lockpath) > # Don't pass --cwd to the child process, because we've already > # changed directory. > @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init > return not os.path.exists(lockpath) > pid = procutil.rundetached(runargs, condfn) > if pid < 0: > + # If the daemonized process managed to write out an error msg, > + # report it. > + if os.path.exists(lockpath): > + with open(lockpath) as log: > + for line in log: > + procutil.stderr.write('server: %s' % line) Event without this hack, stdout/stderr of the child should be visible (at least on Unix.) They aren't redirected until the server starts successfully. > - procutil.hidewindow() > - procutil.stdout.flush() > - procutil.stderr.flush() > + > + if lockpath: > + procutil.hidewindow() > + procutil.stdout.flush() > + procutil.stderr.flush() > > - nullfd = os.open(os.devnull, os.O_RDWR) > - logfilefd = nullfd > - if logfile: > - logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | os.O_APPEND, > - 0o666) > - os.dup2(nullfd, 0) > - os.dup2(logfilefd, 1) > - os.dup2(logfilefd, 2) > - if nullfd not in (0, 1, 2): > - os.close(nullfd) > - if logfile and logfilefd not in (0, 1, 2): > - os.close(logfilefd) > + nullfd = os.open(os.devnull, os.O_RDWR) > + logfilefd = nullfd > + if logfile: > + logfilefd = os.open(logfile, > + os.O_RDWR | os.O_CREAT | os.O_APPEND, > + 0o666) > + os.dup2(nullfd, 0) > + os.dup2(logfilefd, 1) > + os.dup2(logfilefd, 2) > + if nullfd not in (0, 1, 2): > + os.close(nullfd) > + if logfile and logfilefd not in (0, 1, 2): > + os.close(logfilefd) > + > + # Only unlink after redirecting stdout/stderr, so Windows doesn't > + # complain about a sharing violation. > + os.unlink(lockpath) Wheter lockpath is specified or not, stdout/stderr of the child must be redirected to /dev/null.
On Thu, 29 Mar 2018 07:26:04 -0400, Yuya Nishihara <yuya@tcha.org> wrote: > On Wed, 28 Mar 2018 22:45:30 -0400, Matt Harbison wrote: >> # HG changeset patch >> # User Matt Harbison <matt_harbison@yahoo.com> >> # Date 1522210269 14400 >> # Wed Mar 28 00:11:09 2018 -0400 >> # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc >> # Parent 8bac14ce57781d2a45b6d036b3614b1a21917f0d >> server: add an error feedback mechanism for when the daemon fails to >> launch >> >> There's a recurring problem on Windows where `hg serve -d` will >> randomly fail to >> spawn a detached process. The reason for the failure is completely >> hidden, and >> it takes hours to get a single failure on my laptop. All this does is >> redirect >> stdout/stderr of the child to a file until the lock file is freed, and >> then the >> parent dumps it out if it fails to spawn. > > Abusing the lockfile is fine, but ... > >> diff --git a/mercurial/server.py b/mercurial/server.py >> --- a/mercurial/server.py >> +++ b/mercurial/server.py >> @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init >> runargs=None, appendpid=False): >> '''Run a command as a service.''' >> >> + # When daemonized, redirect stdout/stderr to the lockfile (which >> gets >> + # cleaned up after the child is up and running), so that the >> parent can read >> + # and print the error if this child dies early. >> + if opts['daemon_postexec']: >> + for inst in opts['daemon_postexec']: >> + if inst.startswith('unlink:'): >> + lockpath = inst[7:] >> + procutil.stdout.flush() >> + procutil.stderr.flush() >> + logfilefp = open(lockpath, 'w+') > > Nit: os.open(lockpath, os.O_WRONLY | os.O_APPEND | os.O_BINARY) is > probably > better as it enforces that the lock was created by the parent process. > >> + os.dup2(logfilefp.fileno(), 1) >> + os.dup2(logfilefp.fileno(), 2) >> + logfilefp.close() > >> @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init >> try: >> if not runargs: >> runargs = procutil.hgcmd() + pycompat.sysargv[1:] >> + # Only hg processes accept --traceback (e.g., not >> dumbhttp.py) >> + runargs.append('--traceback') > > Doesn't it enable the traceback permanently? The --traceback option > should be > specified explicitly only when needed. Yes. The errors happen in random locations, so I didn't want to sprinkle this on command lines all over. I didn't see a way to detect if it was running under the test harness, but maybe it's OK to conditionalize this based on some environment variable that it sets? >> runargs.append('--daemon-postexec=unlink:%s' % lockpath) >> # Don't pass --cwd to the child process, because we've >> already >> # changed directory. >> @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init >> return not os.path.exists(lockpath) >> pid = procutil.rundetached(runargs, condfn) >> if pid < 0: >> + # If the daemonized process managed to write out an >> error msg, >> + # report it. >> + if os.path.exists(lockpath): >> + with open(lockpath) as log: >> + for line in log: >> + procutil.stderr.write('server: %s' % line) > > Event without this hack, stdout/stderr of the child should be visible (at > least on Unix.) They aren't redirected until the server starts > successfully. Yeah, it looks like it's a Windows problem. See 594dd384803c. I wasn't sure if it's better to special case Windows, but I don't have a problem doing that. >> - procutil.hidewindow() >> - procutil.stdout.flush() >> - procutil.stderr.flush() >> + >> + if lockpath: >> + procutil.hidewindow() >> + procutil.stdout.flush() >> + procutil.stderr.flush() >> >> - nullfd = os.open(os.devnull, os.O_RDWR) >> - logfilefd = nullfd >> - if logfile: >> - logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | >> os.O_APPEND, >> - 0o666) >> - os.dup2(nullfd, 0) >> - os.dup2(logfilefd, 1) >> - os.dup2(logfilefd, 2) >> - if nullfd not in (0, 1, 2): >> - os.close(nullfd) >> - if logfile and logfilefd not in (0, 1, 2): >> - os.close(logfilefd) >> + nullfd = os.open(os.devnull, os.O_RDWR) >> + logfilefd = nullfd >> + if logfile: >> + logfilefd = os.open(logfile, >> + os.O_RDWR | os.O_CREAT | >> os.O_APPEND, >> + 0o666) >> + os.dup2(nullfd, 0) >> + os.dup2(logfilefd, 1) >> + os.dup2(logfilefd, 2) >> + if nullfd not in (0, 1, 2): >> + os.close(nullfd) >> + if logfile and logfilefd not in (0, 1, 2): >> + os.close(logfilefd) >> + >> + # Only unlink after redirecting stdout/stderr, so Windows >> doesn't >> + # complain about a sharing violation. >> + os.unlink(lockpath) > > Wheter lockpath is specified or not, stdout/stderr of the child must be > redirected to /dev/null. That should be easy enough to fix. I was under the impression that the lock file is mandatory.
On Thu, 29 Mar 2018 23:04:39 -0400, Matt Harbison wrote: > On Thu, 29 Mar 2018 07:26:04 -0400, Yuya Nishihara <yuya@tcha.org> wrote: > > > On Wed, 28 Mar 2018 22:45:30 -0400, Matt Harbison wrote: > >> # HG changeset patch > >> # User Matt Harbison <matt_harbison@yahoo.com> > >> # Date 1522210269 14400 > >> # Wed Mar 28 00:11:09 2018 -0400 > >> # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc > >> # Parent 8bac14ce57781d2a45b6d036b3614b1a21917f0d > >> server: add an error feedback mechanism for when the daemon fails to > >> launch > >> > >> There's a recurring problem on Windows where `hg serve -d` will > >> randomly fail to > >> spawn a detached process. The reason for the failure is completely > >> hidden, and > >> it takes hours to get a single failure on my laptop. All this does is > >> redirect > >> stdout/stderr of the child to a file until the lock file is freed, and > >> then the > >> parent dumps it out if it fails to spawn. > > > > Abusing the lockfile is fine, but ... > > > >> diff --git a/mercurial/server.py b/mercurial/server.py > >> --- a/mercurial/server.py > >> +++ b/mercurial/server.py > >> @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init > >> runargs=None, appendpid=False): > >> '''Run a command as a service.''' > >> > >> + # When daemonized, redirect stdout/stderr to the lockfile (which > >> gets > >> + # cleaned up after the child is up and running), so that the > >> parent can read > >> + # and print the error if this child dies early. > >> + if opts['daemon_postexec']: > >> + for inst in opts['daemon_postexec']: > >> + if inst.startswith('unlink:'): > >> + lockpath = inst[7:] > >> + procutil.stdout.flush() > >> + procutil.stderr.flush() > >> + logfilefp = open(lockpath, 'w+') > > > > Nit: os.open(lockpath, os.O_WRONLY | os.O_APPEND | os.O_BINARY) is > > probably > > better as it enforces that the lock was created by the parent process. > > > >> + os.dup2(logfilefp.fileno(), 1) > >> + os.dup2(logfilefp.fileno(), 2) > >> + logfilefp.close() > > > >> @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init > >> try: > >> if not runargs: > >> runargs = procutil.hgcmd() + pycompat.sysargv[1:] > >> + # Only hg processes accept --traceback (e.g., not > >> dumbhttp.py) > >> + runargs.append('--traceback') > > > > Doesn't it enable the traceback permanently? The --traceback option > > should be > > specified explicitly only when needed. > > Yes. The errors happen in random locations, so I didn't want to sprinkle > this on command lines all over. I didn't see a way to detect if it was > running under the test harness, but maybe it's OK to conditionalize this > based on some environment variable that it sets? If you want to enable --traceback temporarily, maybe you can add it to the [defaults] or [alias]. [defaults] serve = --traceback Alternatively, we can add devel.* flag to enable the traceback only for HTTP server, for example. > >> @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init > >> return not os.path.exists(lockpath) > >> pid = procutil.rundetached(runargs, condfn) > >> if pid < 0: > >> + # If the daemonized process managed to write out an > >> error msg, > >> + # report it. > >> + if os.path.exists(lockpath): > >> + with open(lockpath) as log: > >> + for line in log: > >> + procutil.stderr.write('server: %s' % line) > > > > Event without this hack, stdout/stderr of the child should be visible (at > > least on Unix.) They aren't redirected until the server starts > > successfully. > > Yeah, it looks like it's a Windows problem. See 594dd384803c. I wasn't > sure if it's better to special case Windows, but I don't have a problem > doing that. I prefer enabling it conditionally with some inline comments. > >> + os.dup2(nullfd, 0) > >> + os.dup2(logfilefd, 1) > >> + os.dup2(logfilefd, 2) > >> + if nullfd not in (0, 1, 2): > >> + os.close(nullfd) > >> + if logfile and logfilefd not in (0, 1, 2): > >> + os.close(logfilefd) > >> + > >> + # Only unlink after redirecting stdout/stderr, so Windows > >> doesn't > >> + # complain about a sharing violation. > >> + os.unlink(lockpath) > > > > Wheter lockpath is specified or not, stdout/stderr of the child must be > > redirected to /dev/null. > > That should be easy enough to fix. I was under the impression that the > lock file is mandatory. Null redirection is for the daemonized processes, not for the lock file. And IIRC, chg spanws a daemon without the unlink:<path> option.
Patch
diff --git a/mercurial/server.py b/mercurial/server.py --- a/mercurial/server.py +++ b/mercurial/server.py @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init runargs=None, appendpid=False): '''Run a command as a service.''' + # When daemonized, redirect stdout/stderr to the lockfile (which gets + # cleaned up after the child is up and running), so that the parent can read + # and print the error if this child dies early. + if opts['daemon_postexec']: + for inst in opts['daemon_postexec']: + if inst.startswith('unlink:'): + lockpath = inst[7:] + procutil.stdout.flush() + procutil.stderr.flush() + logfilefp = open(lockpath, 'w+') + os.dup2(logfilefp.fileno(), 1) + os.dup2(logfilefp.fileno(), 2) + logfilefp.close() + def writepid(pid): if opts['pid_file']: if appendpid: @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init try: if not runargs: runargs = procutil.hgcmd() + pycompat.sysargv[1:] + # Only hg processes accept --traceback (e.g., not dumbhttp.py) + runargs.append('--traceback') runargs.append('--daemon-postexec=unlink:%s' % lockpath) # Don't pass --cwd to the child process, because we've already # changed directory. @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init return not os.path.exists(lockpath) pid = procutil.rundetached(runargs, condfn) if pid < 0: + # If the daemonized process managed to write out an error msg, + # report it. + if os.path.exists(lockpath): + with open(lockpath) as log: + for line in log: + procutil.stderr.write('server: %s' % line) raise error.Abort(_('child process failed to start')) writepid(pid) finally: @@ -81,31 +103,39 @@ def runservice(opts, parentfn=None, init os.setsid() except AttributeError: pass + + lockpath = None for inst in opts['daemon_postexec']: if inst.startswith('unlink:'): lockpath = inst[7:] - os.unlink(lockpath) elif inst.startswith('chdir:'): os.chdir(inst[6:]) elif inst != 'none': raise error.Abort(_('invalid value for --daemon-postexec: %s') % inst) - procutil.hidewindow() - procutil.stdout.flush() - procutil.stderr.flush() + + if lockpath: + procutil.hidewindow() + procutil.stdout.flush() + procutil.stderr.flush() - nullfd = os.open(os.devnull, os.O_RDWR) - logfilefd = nullfd - if logfile: - logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | os.O_APPEND, - 0o666) - os.dup2(nullfd, 0) - os.dup2(logfilefd, 1) - os.dup2(logfilefd, 2) - if nullfd not in (0, 1, 2): - os.close(nullfd) - if logfile and logfilefd not in (0, 1, 2): - os.close(logfilefd) + nullfd = os.open(os.devnull, os.O_RDWR) + logfilefd = nullfd + if logfile: + logfilefd = os.open(logfile, + os.O_RDWR | os.O_CREAT | os.O_APPEND, + 0o666) + os.dup2(nullfd, 0) + os.dup2(logfilefd, 1) + os.dup2(logfilefd, 2) + if nullfd not in (0, 1, 2): + os.close(nullfd) + if logfile and logfilefd not in (0, 1, 2): + os.close(logfilefd) + + # Only unlink after redirecting stdout/stderr, so Windows doesn't + # complain about a sharing violation. + os.unlink(lockpath) if runfn: return runfn() diff --git a/tests/test-serve.t b/tests/test-serve.t --- a/tests/test-serve.t +++ b/tests/test-serve.t @@ -47,8 +47,8 @@ With -v and -p daytime (should fail beca #if no-root no-windows $ KILLQUIETLY=Y - $ hgserve -p daytime - abort: cannot start server at 'localhost:13': Permission denied + $ hgserve -p daytime 2>&1 | egrep -v '^server: (Traceback|Abort| )' + server: abort: cannot start server at 'localhost:13': Permission denied abort: child process failed to start % errors $ KILLQUIETLY=N