Patchwork D2734: hgweb: parse WSGI request into a data structure

login
register
mail settings
Submitter phabricator
Date March 9, 2018, 7:29 p.m.
Message ID <7d080e298bc067e45fde898953b5f016@localhost.localdomain>
Download mbox | patch
Permalink /patch/29192/
State Not Applicable
Headers show

Comments

phabricator - March 9, 2018, 7:29 p.m.
This revision was automatically updated to reflect the committed changes.
Closed by commit rHG3e13a21ce6b5: hgweb: parse WSGI request into a data structure (authored by indygreg, committed by ).

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D2734?vs=6740&id=6778

REVISION DETAIL
  https://phab.mercurial-scm.org/D2734

AFFECTED FILES
  mercurial/hgweb/hgweb_mod.py
  mercurial/hgweb/hgwebdir_mod.py
  mercurial/hgweb/request.py

CHANGE DETAILS




To: indygreg, #hg-reviewers, durin42
Cc: mercurial-devel

Patch

diff --git a/mercurial/hgweb/request.py b/mercurial/hgweb/request.py
--- a/mercurial/hgweb/request.py
+++ b/mercurial/hgweb/request.py
@@ -11,13 +11,17 @@ 
 import cgi
 import errno
 import socket
+#import wsgiref.validate
 
 from .common import (
     ErrorResponse,
     HTTP_NOT_MODIFIED,
     statusmessage,
 )
 
+from ..thirdparty import (
+    attr,
+)
 from .. import (
     pycompat,
     util,
@@ -54,6 +58,124 @@ 
             pycompat.bytesurl(i.strip()) for i in v]
     return bytesform
 
+@attr.s(frozen=True)
+class parsedrequest(object):
+    """Represents a parsed WSGI request / static HTTP request parameters."""
+
+    # Full URL for this request.
+    url = attr.ib()
+    # URL without any path components. Just <proto>://<host><port>.
+    baseurl = attr.ib()
+    # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead
+    # of HTTP: Host header for hostname. This is likely what clients used.
+    advertisedurl = attr.ib()
+    advertisedbaseurl = attr.ib()
+    # WSGI application path.
+    apppath = attr.ib()
+    # List of path parts to be used for dispatch.
+    dispatchparts = attr.ib()
+    # URL path component (no query string) used for dispatch.
+    dispatchpath = attr.ib()
+    # Raw query string (part after "?" in URL).
+    querystring = attr.ib()
+
+def parserequestfromenv(env):
+    """Parse URL components from environment variables.
+
+    WSGI defines request attributes via environment variables. This function
+    parses the environment variables into a data structure.
+    """
+    # PEP-0333 defines the WSGI spec and is a useful reference for this code.
+
+    # We first validate that the incoming object conforms with the WSGI spec.
+    # We only want to be dealing with spec-conforming WSGI implementations.
+    # TODO enable this once we fix internal violations.
+    #wsgiref.validate.check_environ(env)
+
+    # PEP-0333 states that environment keys and values are native strings
+    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
+    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
+    # in Mercurial, so mass convert string keys and values to bytes.
+    if pycompat.ispy3:
+        env = {k.encode('latin-1'): v for k, v in env.iteritems()}
+        env = {k: v.encode('latin-1') if isinstance(v, str) else v
+               for k, v in env.iteritems()}
+
+    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
+    # the environment variables.
+    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
+    # how URLs are reconstructed.
+    fullurl = env['wsgi.url_scheme'] + '://'
+    advertisedfullurl = fullurl
+
+    def addport(s):
+        if env['wsgi.url_scheme'] == 'https':
+            if env['SERVER_PORT'] != '443':
+                s += ':' + env['SERVER_PORT']
+        else:
+            if env['SERVER_PORT'] != '80':
+                s += ':' + env['SERVER_PORT']
+
+        return s
+
+    if env.get('HTTP_HOST'):
+        fullurl += env['HTTP_HOST']
+    else:
+        fullurl += env['SERVER_NAME']
+        addport(fullurl)
+
+    advertisedfullurl += env['SERVER_NAME']
+    advertisedfullurl = addport(advertisedfullurl)
+
+    baseurl = fullurl
+    advertisedbaseurl = advertisedfullurl
+
+    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
+    advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
+    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))
+    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))
+
+    if env.get('QUERY_STRING'):
+        fullurl += '?' + env['QUERY_STRING']
+        advertisedfullurl += '?' + env['QUERY_STRING']
+
+    # When dispatching requests, we look at the URL components (PATH_INFO
+    # and QUERY_STRING) after the application root (SCRIPT_NAME). But hgwebdir
+    # has the concept of "virtual" repositories. This is defined via REPO_NAME.
+    # If REPO_NAME is defined, we append it to SCRIPT_NAME to form a new app
+    # root. We also exclude its path components from PATH_INFO when resolving
+    # the dispatch path.
+
+    # TODO the use of trailing slashes in apppath is arguably wrong. We need it
+    # to appease low-level parts of hgweb_mod for now.
+    apppath = env['SCRIPT_NAME']
+    if not apppath.endswith('/'):
+        apppath += '/'
+
+    if env.get('REPO_NAME'):
+        apppath += env.get('REPO_NAME') + '/'
+
+    if 'PATH_INFO' in env:
+        dispatchparts = env['PATH_INFO'].strip('/').split('/')
+
+        # Strip out repo parts.
+        repoparts = env.get('REPO_NAME', '').split('/')
+        if dispatchparts[:len(repoparts)] == repoparts:
+            dispatchparts = dispatchparts[len(repoparts):]
+    else:
+        dispatchparts = []
+
+    dispatchpath = '/'.join(dispatchparts)
+
+    querystring = env.get('QUERY_STRING', '')
+
+    return parsedrequest(url=fullurl, baseurl=baseurl,
+                         advertisedurl=advertisedfullurl,
+                         advertisedbaseurl=advertisedbaseurl,
+                         apppath=apppath,
+                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
+                         querystring=querystring)
+
 class wsgirequest(object):
     """Higher-level API for a WSGI request.
 
diff --git a/mercurial/hgweb/hgwebdir_mod.py b/mercurial/hgweb/hgwebdir_mod.py
--- a/mercurial/hgweb/hgwebdir_mod.py
+++ b/mercurial/hgweb/hgwebdir_mod.py
@@ -229,6 +229,8 @@ 
                 yield r
 
     def _runwsgi(self, wsgireq):
+        req = requestmod.parserequestfromenv(wsgireq.env)
+
         try:
             self.refresh()
 
diff --git a/mercurial/hgweb/hgweb_mod.py b/mercurial/hgweb/hgweb_mod.py
--- a/mercurial/hgweb/hgweb_mod.py
+++ b/mercurial/hgweb/hgweb_mod.py
@@ -316,6 +316,7 @@ 
                     yield r
 
     def _runwsgi(self, wsgireq, repo):
+        req = requestmod.parserequestfromenv(wsgireq.env)
         rctx = requestcontext(self, repo)
 
         # This state is global across all threads.
@@ -329,14 +330,7 @@ 
                                if h[0] != 'Content-Security-Policy']
             wsgireq.headers.append(('Content-Security-Policy', rctx.csp))
 
-        # work with CGI variables to create coherent structure
-        # use SCRIPT_NAME, PATH_INFO and QUERY_STRING as well as our REPO_NAME
-
-        wsgireq.url = wsgireq.env[r'SCRIPT_NAME']
-        if not wsgireq.url.endswith(r'/'):
-            wsgireq.url += r'/'
-        if wsgireq.env.get('REPO_NAME'):
-            wsgireq.url += wsgireq.env[r'REPO_NAME'] + r'/'
+        wsgireq.url = pycompat.sysstr(req.apppath)
 
         if r'PATH_INFO' in wsgireq.env:
             parts = wsgireq.env[r'PATH_INFO'].strip(r'/').split(r'/')