Patchwork [1,of,2] server: add an error feedback mechanism for when the daemon fails to launch

login
register
mail settings
Submitter Matt Harbison
Date March 29, 2018, 2:45 a.m.
Message ID <7a6eccc3b43dd8f6090b.1522291530@Envy>
Download mbox | patch
Permalink /patch/29930/
State Accepted
Headers show

Comments

Matt Harbison - March 29, 2018, 2:45 a.m.
# HG changeset patch
# User Matt Harbison <matt_harbison@yahoo.com>
# Date 1522210269 14400
#      Wed Mar 28 00:11:09 2018 -0400
# Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc
# Parent  8bac14ce57781d2a45b6d036b3614b1a21917f0d
server: add an error feedback mechanism for when the daemon fails to launch

There's a recurring problem on Windows where `hg serve -d` will randomly fail to
spawn a detached process.  The reason for the failure is completely hidden, and
it takes hours to get a single failure on my laptop.  All this does is redirect
stdout/stderr of the child to a file until the lock file is freed, and then the
parent dumps it out if it fails to spawn.

I chose to put the output into the lock file because that is always cleaned up.
There's no way to report errors after that anyway.  On Windows, killdaemons.py
is roughly `kill -9`, so this ensures that junk won't pile up.

This may end up being a case of EADDRINUSE.  At least that's what I saw spit out
a few times (among other odd errors and missing output on Windows).  But I also
managed to get the same thing on Fedora 26 by running test-hgwebdir.t with
--loop -j10 for several hours.  Running `netstat` immediately after killing that
run printed a wall of sockets in the TIME_WAIT state, which were gone a couple
seconds later.  I couldn't match up ports that failed, because --loop doesn't
print out the message about the port that was used.  So maybe the fix is to
rotate the use of HGPORT[12] in the tests.  But, let's collect some more data
first.
Yuya Nishihara - March 29, 2018, 11:26 a.m.
On Wed, 28 Mar 2018 22:45:30 -0400, Matt Harbison wrote:
> # HG changeset patch
> # User Matt Harbison <matt_harbison@yahoo.com>
> # Date 1522210269 14400
> #      Wed Mar 28 00:11:09 2018 -0400
> # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc
> # Parent  8bac14ce57781d2a45b6d036b3614b1a21917f0d
> server: add an error feedback mechanism for when the daemon fails to launch
> 
> There's a recurring problem on Windows where `hg serve -d` will randomly fail to
> spawn a detached process.  The reason for the failure is completely hidden, and
> it takes hours to get a single failure on my laptop.  All this does is redirect
> stdout/stderr of the child to a file until the lock file is freed, and then the
> parent dumps it out if it fails to spawn.

Abusing the lockfile is fine, but ...

> diff --git a/mercurial/server.py b/mercurial/server.py
> --- a/mercurial/server.py
> +++ b/mercurial/server.py
> @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init
>                 runargs=None, appendpid=False):
>      '''Run a command as a service.'''
>  
> +    # When daemonized, redirect stdout/stderr to the lockfile (which gets
> +    # cleaned up after the child is up and running), so that the parent can read
> +    # and print the error if this child dies early.
> +    if opts['daemon_postexec']:
> +        for inst in opts['daemon_postexec']:
> +            if inst.startswith('unlink:'):
> +                lockpath = inst[7:]
> +                procutil.stdout.flush()
> +                procutil.stderr.flush()
> +                logfilefp = open(lockpath, 'w+')

Nit: os.open(lockpath, os.O_WRONLY | os.O_APPEND | os.O_BINARY) is probably
better as it enforces that the lock was created by the parent process.

> +                os.dup2(logfilefp.fileno(), 1)
> +                os.dup2(logfilefp.fileno(), 2)
> +                logfilefp.close()

> @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init
>          try:
>              if not runargs:
>                  runargs = procutil.hgcmd() + pycompat.sysargv[1:]
> +                # Only hg processes accept --traceback (e.g., not dumbhttp.py)
> +                runargs.append('--traceback')

Doesn't it enable the traceback permanently? The --traceback option should be
specified explicitly only when needed.

>              runargs.append('--daemon-postexec=unlink:%s' % lockpath)
>              # Don't pass --cwd to the child process, because we've already
>              # changed directory.
> @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init
>                  return not os.path.exists(lockpath)
>              pid = procutil.rundetached(runargs, condfn)
>              if pid < 0:
> +                # If the daemonized process managed to write out an error msg,
> +                # report it.
> +                if os.path.exists(lockpath):
> +                    with open(lockpath) as log:
> +                        for line in log:
> +                            procutil.stderr.write('server: %s' % line)

Event without this hack, stdout/stderr of the child should be visible (at
least on Unix.) They aren't redirected until the server starts successfully.

> -        procutil.hidewindow()
> -        procutil.stdout.flush()
> -        procutil.stderr.flush()
> +
> +        if lockpath:
> +            procutil.hidewindow()
> +            procutil.stdout.flush()
> +            procutil.stderr.flush()
>  
> -        nullfd = os.open(os.devnull, os.O_RDWR)
> -        logfilefd = nullfd
> -        if logfile:
> -            logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | os.O_APPEND,
> -                                0o666)
> -        os.dup2(nullfd, 0)
> -        os.dup2(logfilefd, 1)
> -        os.dup2(logfilefd, 2)
> -        if nullfd not in (0, 1, 2):
> -            os.close(nullfd)
> -        if logfile and logfilefd not in (0, 1, 2):
> -            os.close(logfilefd)
> +            nullfd = os.open(os.devnull, os.O_RDWR)
> +            logfilefd = nullfd
> +            if logfile:
> +                logfilefd = os.open(logfile,
> +                                    os.O_RDWR | os.O_CREAT | os.O_APPEND,
> +                                    0o666)
> +            os.dup2(nullfd, 0)
> +            os.dup2(logfilefd, 1)
> +            os.dup2(logfilefd, 2)
> +            if nullfd not in (0, 1, 2):
> +                os.close(nullfd)
> +            if logfile and logfilefd not in (0, 1, 2):
> +                os.close(logfilefd)
> +
> +            # Only unlink after redirecting stdout/stderr, so Windows doesn't
> +            # complain about a sharing violation.
> +            os.unlink(lockpath)

Wheter lockpath is specified or not, stdout/stderr of the child must be
redirected to /dev/null.
Matt Harbison - March 30, 2018, 3:04 a.m.
On Thu, 29 Mar 2018 07:26:04 -0400, Yuya Nishihara <yuya@tcha.org> wrote:

> On Wed, 28 Mar 2018 22:45:30 -0400, Matt Harbison wrote:
>> # HG changeset patch
>> # User Matt Harbison <matt_harbison@yahoo.com>
>> # Date 1522210269 14400
>> #      Wed Mar 28 00:11:09 2018 -0400
>> # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc
>> # Parent  8bac14ce57781d2a45b6d036b3614b1a21917f0d
>> server: add an error feedback mechanism for when the daemon fails to  
>> launch
>>
>> There's a recurring problem on Windows where `hg serve -d` will  
>> randomly fail to
>> spawn a detached process.  The reason for the failure is completely  
>> hidden, and
>> it takes hours to get a single failure on my laptop.  All this does is  
>> redirect
>> stdout/stderr of the child to a file until the lock file is freed, and  
>> then the
>> parent dumps it out if it fails to spawn.
>
> Abusing the lockfile is fine, but ...
>
>> diff --git a/mercurial/server.py b/mercurial/server.py
>> --- a/mercurial/server.py
>> +++ b/mercurial/server.py
>> @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init
>>                 runargs=None, appendpid=False):
>>      '''Run a command as a service.'''
>>
>> +    # When daemonized, redirect stdout/stderr to the lockfile (which  
>> gets
>> +    # cleaned up after the child is up and running), so that the  
>> parent can read
>> +    # and print the error if this child dies early.
>> +    if opts['daemon_postexec']:
>> +        for inst in opts['daemon_postexec']:
>> +            if inst.startswith('unlink:'):
>> +                lockpath = inst[7:]
>> +                procutil.stdout.flush()
>> +                procutil.stderr.flush()
>> +                logfilefp = open(lockpath, 'w+')
>
> Nit: os.open(lockpath, os.O_WRONLY | os.O_APPEND | os.O_BINARY) is  
> probably
> better as it enforces that the lock was created by the parent process.
>
>> +                os.dup2(logfilefp.fileno(), 1)
>> +                os.dup2(logfilefp.fileno(), 2)
>> +                logfilefp.close()
>
>> @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init
>>          try:
>>              if not runargs:
>>                  runargs = procutil.hgcmd() + pycompat.sysargv[1:]
>> +                # Only hg processes accept --traceback (e.g., not  
>> dumbhttp.py)
>> +                runargs.append('--traceback')
>
> Doesn't it enable the traceback permanently? The --traceback option  
> should be
> specified explicitly only when needed.

Yes.  The errors happen in random locations, so I didn't want to sprinkle  
this on command lines all over.  I didn't see a way to detect if it was  
running under the test harness, but maybe it's OK to conditionalize this  
based on some environment variable that it sets?

>>              runargs.append('--daemon-postexec=unlink:%s' % lockpath)
>>              # Don't pass --cwd to the child process, because we've  
>> already
>>              # changed directory.
>> @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init
>>                  return not os.path.exists(lockpath)
>>              pid = procutil.rundetached(runargs, condfn)
>>              if pid < 0:
>> +                # If the daemonized process managed to write out an  
>> error msg,
>> +                # report it.
>> +                if os.path.exists(lockpath):
>> +                    with open(lockpath) as log:
>> +                        for line in log:
>> +                            procutil.stderr.write('server: %s' % line)
>
> Event without this hack, stdout/stderr of the child should be visible (at
> least on Unix.) They aren't redirected until the server starts  
> successfully.

Yeah, it looks like it's a Windows problem.  See 594dd384803c.  I wasn't  
sure if it's better to special case Windows, but I don't have a problem  
doing that.

>> -        procutil.hidewindow()
>> -        procutil.stdout.flush()
>> -        procutil.stderr.flush()
>> +
>> +        if lockpath:
>> +            procutil.hidewindow()
>> +            procutil.stdout.flush()
>> +            procutil.stderr.flush()
>>
>> -        nullfd = os.open(os.devnull, os.O_RDWR)
>> -        logfilefd = nullfd
>> -        if logfile:
>> -            logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT |  
>> os.O_APPEND,
>> -                                0o666)
>> -        os.dup2(nullfd, 0)
>> -        os.dup2(logfilefd, 1)
>> -        os.dup2(logfilefd, 2)
>> -        if nullfd not in (0, 1, 2):
>> -            os.close(nullfd)
>> -        if logfile and logfilefd not in (0, 1, 2):
>> -            os.close(logfilefd)
>> +            nullfd = os.open(os.devnull, os.O_RDWR)
>> +            logfilefd = nullfd
>> +            if logfile:
>> +                logfilefd = os.open(logfile,
>> +                                    os.O_RDWR | os.O_CREAT |  
>> os.O_APPEND,
>> +                                    0o666)
>> +            os.dup2(nullfd, 0)
>> +            os.dup2(logfilefd, 1)
>> +            os.dup2(logfilefd, 2)
>> +            if nullfd not in (0, 1, 2):
>> +                os.close(nullfd)
>> +            if logfile and logfilefd not in (0, 1, 2):
>> +                os.close(logfilefd)
>> +
>> +            # Only unlink after redirecting stdout/stderr, so Windows  
>> doesn't
>> +            # complain about a sharing violation.
>> +            os.unlink(lockpath)
>
> Wheter lockpath is specified or not, stdout/stderr of the child must be
> redirected to /dev/null.

That should be easy enough to fix.  I was under the impression that the  
lock file is mandatory.
Yuya Nishihara - March 30, 2018, 12:02 p.m.
On Thu, 29 Mar 2018 23:04:39 -0400, Matt Harbison wrote:
> On Thu, 29 Mar 2018 07:26:04 -0400, Yuya Nishihara <yuya@tcha.org> wrote:
> 
> > On Wed, 28 Mar 2018 22:45:30 -0400, Matt Harbison wrote:
> >> # HG changeset patch
> >> # User Matt Harbison <matt_harbison@yahoo.com>
> >> # Date 1522210269 14400
> >> #      Wed Mar 28 00:11:09 2018 -0400
> >> # Node ID 7a6eccc3b43dd8f6090b23aa38b2f71d33f5bbdc
> >> # Parent  8bac14ce57781d2a45b6d036b3614b1a21917f0d
> >> server: add an error feedback mechanism for when the daemon fails to  
> >> launch
> >>
> >> There's a recurring problem on Windows where `hg serve -d` will  
> >> randomly fail to
> >> spawn a detached process.  The reason for the failure is completely  
> >> hidden, and
> >> it takes hours to get a single failure on my laptop.  All this does is  
> >> redirect
> >> stdout/stderr of the child to a file until the lock file is freed, and  
> >> then the
> >> parent dumps it out if it fails to spawn.
> >
> > Abusing the lockfile is fine, but ...
> >
> >> diff --git a/mercurial/server.py b/mercurial/server.py
> >> --- a/mercurial/server.py
> >> +++ b/mercurial/server.py
> >> @@ -30,6 +30,20 @@ def runservice(opts, parentfn=None, init
> >>                 runargs=None, appendpid=False):
> >>      '''Run a command as a service.'''
> >>
> >> +    # When daemonized, redirect stdout/stderr to the lockfile (which  
> >> gets
> >> +    # cleaned up after the child is up and running), so that the  
> >> parent can read
> >> +    # and print the error if this child dies early.
> >> +    if opts['daemon_postexec']:
> >> +        for inst in opts['daemon_postexec']:
> >> +            if inst.startswith('unlink:'):
> >> +                lockpath = inst[7:]
> >> +                procutil.stdout.flush()
> >> +                procutil.stderr.flush()
> >> +                logfilefp = open(lockpath, 'w+')
> >
> > Nit: os.open(lockpath, os.O_WRONLY | os.O_APPEND | os.O_BINARY) is  
> > probably
> > better as it enforces that the lock was created by the parent process.
> >
> >> +                os.dup2(logfilefp.fileno(), 1)
> >> +                os.dup2(logfilefp.fileno(), 2)
> >> +                logfilefp.close()
> >
> >> @@ -47,6 +61,8 @@ def runservice(opts, parentfn=None, init
> >>          try:
> >>              if not runargs:
> >>                  runargs = procutil.hgcmd() + pycompat.sysargv[1:]
> >> +                # Only hg processes accept --traceback (e.g., not  
> >> dumbhttp.py)
> >> +                runargs.append('--traceback')
> >
> > Doesn't it enable the traceback permanently? The --traceback option  
> > should be
> > specified explicitly only when needed.
> 
> Yes.  The errors happen in random locations, so I didn't want to sprinkle  
> this on command lines all over.  I didn't see a way to detect if it was  
> running under the test harness, but maybe it's OK to conditionalize this  
> based on some environment variable that it sets?

If you want to enable --traceback temporarily, maybe you can add it to the
[defaults] or [alias].

  [defaults]
  serve = --traceback

Alternatively, we can add devel.* flag to enable the traceback only for
HTTP server, for example.

> >> @@ -61,6 +77,12 @@ def runservice(opts, parentfn=None, init
> >>                  return not os.path.exists(lockpath)
> >>              pid = procutil.rundetached(runargs, condfn)
> >>              if pid < 0:
> >> +                # If the daemonized process managed to write out an  
> >> error msg,
> >> +                # report it.
> >> +                if os.path.exists(lockpath):
> >> +                    with open(lockpath) as log:
> >> +                        for line in log:
> >> +                            procutil.stderr.write('server: %s' % line)
> >
> > Event without this hack, stdout/stderr of the child should be visible (at
> > least on Unix.) They aren't redirected until the server starts  
> > successfully.
> 
> Yeah, it looks like it's a Windows problem.  See 594dd384803c.  I wasn't  
> sure if it's better to special case Windows, but I don't have a problem  
> doing that.

I prefer enabling it conditionally with some inline comments.

> >> +            os.dup2(nullfd, 0)
> >> +            os.dup2(logfilefd, 1)
> >> +            os.dup2(logfilefd, 2)
> >> +            if nullfd not in (0, 1, 2):
> >> +                os.close(nullfd)
> >> +            if logfile and logfilefd not in (0, 1, 2):
> >> +                os.close(logfilefd)
> >> +
> >> +            # Only unlink after redirecting stdout/stderr, so Windows  
> >> doesn't
> >> +            # complain about a sharing violation.
> >> +            os.unlink(lockpath)
> >
> > Wheter lockpath is specified or not, stdout/stderr of the child must be
> > redirected to /dev/null.
> 
> That should be easy enough to fix.  I was under the impression that the  
> lock file is mandatory.

Null redirection is for the daemonized processes, not for the lock file. And
IIRC, chg spanws a daemon without the unlink:<path> option.

Patch

diff --git a/mercurial/server.py b/mercurial/server.py
--- a/mercurial/server.py
+++ b/mercurial/server.py
@@ -30,6 +30,20 @@  def runservice(opts, parentfn=None, init
                runargs=None, appendpid=False):
     '''Run a command as a service.'''
 
+    # When daemonized, redirect stdout/stderr to the lockfile (which gets
+    # cleaned up after the child is up and running), so that the parent can read
+    # and print the error if this child dies early.
+    if opts['daemon_postexec']:
+        for inst in opts['daemon_postexec']:
+            if inst.startswith('unlink:'):
+                lockpath = inst[7:]
+                procutil.stdout.flush()
+                procutil.stderr.flush()
+                logfilefp = open(lockpath, 'w+')
+                os.dup2(logfilefp.fileno(), 1)
+                os.dup2(logfilefp.fileno(), 2)
+                logfilefp.close()
+
     def writepid(pid):
         if opts['pid_file']:
             if appendpid:
@@ -47,6 +61,8 @@  def runservice(opts, parentfn=None, init
         try:
             if not runargs:
                 runargs = procutil.hgcmd() + pycompat.sysargv[1:]
+                # Only hg processes accept --traceback (e.g., not dumbhttp.py)
+                runargs.append('--traceback')
             runargs.append('--daemon-postexec=unlink:%s' % lockpath)
             # Don't pass --cwd to the child process, because we've already
             # changed directory.
@@ -61,6 +77,12 @@  def runservice(opts, parentfn=None, init
                 return not os.path.exists(lockpath)
             pid = procutil.rundetached(runargs, condfn)
             if pid < 0:
+                # If the daemonized process managed to write out an error msg,
+                # report it.
+                if os.path.exists(lockpath):
+                    with open(lockpath) as log:
+                        for line in log:
+                            procutil.stderr.write('server: %s' % line)
                 raise error.Abort(_('child process failed to start'))
             writepid(pid)
         finally:
@@ -81,31 +103,39 @@  def runservice(opts, parentfn=None, init
             os.setsid()
         except AttributeError:
             pass
+
+        lockpath = None
         for inst in opts['daemon_postexec']:
             if inst.startswith('unlink:'):
                 lockpath = inst[7:]
-                os.unlink(lockpath)
             elif inst.startswith('chdir:'):
                 os.chdir(inst[6:])
             elif inst != 'none':
                 raise error.Abort(_('invalid value for --daemon-postexec: %s')
                                   % inst)
-        procutil.hidewindow()
-        procutil.stdout.flush()
-        procutil.stderr.flush()
+
+        if lockpath:
+            procutil.hidewindow()
+            procutil.stdout.flush()
+            procutil.stderr.flush()
 
-        nullfd = os.open(os.devnull, os.O_RDWR)
-        logfilefd = nullfd
-        if logfile:
-            logfilefd = os.open(logfile, os.O_RDWR | os.O_CREAT | os.O_APPEND,
-                                0o666)
-        os.dup2(nullfd, 0)
-        os.dup2(logfilefd, 1)
-        os.dup2(logfilefd, 2)
-        if nullfd not in (0, 1, 2):
-            os.close(nullfd)
-        if logfile and logfilefd not in (0, 1, 2):
-            os.close(logfilefd)
+            nullfd = os.open(os.devnull, os.O_RDWR)
+            logfilefd = nullfd
+            if logfile:
+                logfilefd = os.open(logfile,
+                                    os.O_RDWR | os.O_CREAT | os.O_APPEND,
+                                    0o666)
+            os.dup2(nullfd, 0)
+            os.dup2(logfilefd, 1)
+            os.dup2(logfilefd, 2)
+            if nullfd not in (0, 1, 2):
+                os.close(nullfd)
+            if logfile and logfilefd not in (0, 1, 2):
+                os.close(logfilefd)
+
+            # Only unlink after redirecting stdout/stderr, so Windows doesn't
+            # complain about a sharing violation.
+            os.unlink(lockpath)
 
     if runfn:
         return runfn()
diff --git a/tests/test-serve.t b/tests/test-serve.t
--- a/tests/test-serve.t
+++ b/tests/test-serve.t
@@ -47,8 +47,8 @@  With -v and -p daytime (should fail beca
 
 #if no-root no-windows
   $ KILLQUIETLY=Y
-  $ hgserve -p daytime
-  abort: cannot start server at 'localhost:13': Permission denied
+  $ hgserve -p daytime 2>&1 | egrep -v '^server: (Traceback|Abort|  )'
+  server: abort: cannot start server at 'localhost:13': Permission denied
   abort: child process failed to start
   % errors
   $ KILLQUIETLY=N