tools/bug_tool/ClientCookie/_HeadersUtil.py

   1 """HTTP header value parsing utility functions.
   2
   3 from ClientCookie._HeadersUtil import split_header_words
   4 values = split_header_words(h.headers["Content-Type"])
   5
This module provides a few functions that help with parsing and constructing
valid HTTP header values.
   8
   9
  10 Copyright 1997-1998, Gisle Aas
  11 Copyright 2002-2003, John J. Lee
  12
  13 This code is free software; you can redistribute it and/or modify it under
  14 the terms of the BSD License (see the file COPYING included with the
  15 distribution).
  16
  17 """
  18
  19 import re, string
  20 from types import StringType
  21 try:
  22     from types import UnicodeType
  23     STRING_TYPES = StringType, UnicodeType
  24 except:
  25     STRING_TYPES = StringType,
  26
  27 from _Util import startswith, endswith, http2time
  28
  29 try: True
  30 except NameError:
  31     True = 1
  32     False = 0
  33
  34 def unmatched(match):
  35     """Return unmatched part of re.Match object."""
  36     start, end = match.span(0)
  37     return match.string[:start]+match.string[end:]
  38
  39 # XXX I really can't see what this =* was for (came from LWP, I guess)
  40 #token_re =        re.compile(r"^\s*(=*[^\s=;,]+)")
  41 token_re =        re.compile(r"^\s*([^=\s;,]+)")
  42 quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
  43 value_re =        re.compile(r"^\s*=\s*([^\s;,]*)")
  44 escape_re = re.compile(r"\\(.)")
  45 def split_header_words(header_values):
  46     r"""Parse header values into a list of lists containing key,value pairs.
  47
  48     The function knows how to deal with ",", ";" and "=" as well as quoted
  49     values after "=".  A list of space separated tokens are parsed as if they
  50     were separated by ";".
  51
  52     If the header_values passed as argument contains multiple values, then they
  53     are treated as if they were a single value separated by comma ",".
  54
  55     This means that this function is useful for parsing header fields that
  56     follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
  57     the requirement for tokens).
  58
  59       headers           = #header
  60       header            = (token | parameter) *( [";"] (token | parameter))
  61
  62       token             = 1*<any CHAR except CTLs or separators>
  63       separators        = "(" | ")" | "<" | ">" | "@"
  64                         | "," | ";" | ":" | "\" | <">
  65                         | "/" | "[" | "]" | "?" | "="
  66                         | "{" | "}" | SP | HT
  67
  68       quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
  69       qdtext            = <any TEXT except <">>
  70       quoted-pair       = "\" CHAR
  71
  72       parameter         = attribute "=" value
  73       attribute         = token
  74       value             = token | quoted-string
  75
  76     Each header is represented by a list of key/value pairs.  The value for a
  77     simple token (not part of a parameter) is None.  Syntactically incorrect
  78     headers will not necessarily be parsed as you would want.
  79
  80     This is easier to describe with some examples:
  81
  82     >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
  83     [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
  84     >>> split_header_words(['text/html; charset="iso-8859-1"'])
  85     [[('text/html', None), ('charset', 'iso-8859-1')]]
  86     >>> split_header_words([r'Basic realm="\"foo\bar\""'])
  87     [[('Basic', None), ('realm', '"foobar"')]]
  88
  89     """
  90     assert type(header_values) not in STRING_TYPES
  91     result = []
  92     for text in header_values:
  93         orig_text = text
  94         pairs = []
  95         while text:
  96             m = token_re.search(text)
  97             if m:
  98                 text = unmatched(m)
  99                 name = m.group(1)
 100                 m = quoted_value_re.search(text)
 101                 if m:  # quoted value
 102                     text = unmatched(m)
 103                     value = m.group(1)
 104                     value = escape_re.sub(r"\1", value)
 105                 else:
 106                     m = value_re.search(text)
 107                     if m:  # unquoted value
 108                         text = unmatched(m)
 109                         value = m.group(1)
 110                         value = string.rstrip(value)
 111                     else:
 112                         # no value, a lone token
 113                         value = None
 114                 pairs.append((name, value))
 115             elif startswith(string.lstrip(text), ","):
 116                 # concatenated headers, as per RFC 2616 section 4.2
 117                 text = string.lstrip(text)[1:]
 118                 if pairs: result.append(pairs)
 119                 pairs = []
 120             else:
 121                 # skip junk
 122                 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
 123                 assert nr_junk_chars > 0, (
 124                     "split_header_words bug: '%s', '%s', %s" %
 125                     (orig_text, text, pairs))
 126                 text = non_junk
 127         if pairs: result.append(pairs)
 128     return result
 129
 130 join_escape_re = re.compile(r"([\"\\])")
 131 def join_header_words(lists):
 132     """Do the inverse of the conversion done by split_header_words.
 133
 134     Takes a list of lists of (key, value) pairs and produces a single header
 135     value.  Attribute values are quoted if needed.
 136
 137     >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
 138     'text/plain; charset="iso-8859/1"'
 139     >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
 140     'text/plain, charset="iso-8859/1"'
 141
 142     """
 143     headers = []
 144     for pairs in lists:
 145         attr = []
 146         for k, v in pairs:
 147             if v is not None:
 148                 if not re.search(r"^\w+$", v):
 149                     v = join_escape_re.sub(r"\\\1", v)  # escape " and \
 150                     v = '"%s"' % v
 151                 if k is None:  # Netscape cookies may have no name
 152                     k = v
 153                 else:
 154                     k = "%s=%s" % (k, v)
 155             attr.append(k)
 156         if attr: headers.append(string.join(attr, "; "))
 157     return string.join(headers, ", ")
 158
 159 def parse_ns_headers(ns_headers):
 160     """Ad-hoc parser for Netscape protocol cookie-attributes.
 161
 162     The old Netscape cookie format for Set-Cookie can for instance contain
 163     an unquoted "," in the expires field, so we have to use this ad-hoc
 164     parser instead of split_header_words.
 165
 166     XXX This may not make the best possible effort to parse all the crap
 167     that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
 168     parser is probably better, so could do worse than following that if
 169     this ever gives any trouble.
 170
 171     Currently, this is also used for parsing RFC 2109 cookies.
 172
 173     """
 174     known_attrs = ("expires", "domain", "path", "secure",
 175                    # RFC 2109 attrs (may turn up in Netscape cookies, too)
 176                    "port", "max-age")
 177
 178     result = []
 179     for ns_header in ns_headers:
 180         pairs = []
 181         version_set = False
 182         for param in re.split(r";\s*", ns_header):
 183             param = string.rstrip(param)
 184             if param == "": continue
 185             if "=" not in param:
 186                 if string.lower(param) in known_attrs:
 187                     k, v = param, None
 188                 else:
 189                     # cookie with missing name
 190                     k, v = None, param
 191             else:
 192                 k, v = re.split(r"\s*=\s*", param, 1)
 193                 k = string.lstrip(k)
 194             if k is not None:
 195                 lc = string.lower(k)
 196                 if lc in known_attrs:
 197                     k = lc
 198                 if k == "version":
 199                     # This is an RFC 2109 cookie.  Will be treated as RFC 2965
 200                     # cookie in rest of code.
 201                     # Probably it should be parsed with split_header_words, but
 202                     # that's too much hassle.
 203                     version_set = True
 204                 if k == "expires":
 205                     # convert expires date to seconds since epoch
 206                     if startswith(v, '"'): v = v[1:]
 207                     if endswith(v, '"'): v = v[:-1]
 208                     v = http2time(v)  # None if invalid
 209             pairs.append((k, v))
 210
 211         if pairs:
 212             if not version_set:
 213                 pairs.append(("version", "0"))
 214             result.append(pairs)
 215
 216     return result
 217
 218
 219 def _test():
 220    import doctest, _HeadersUtil
 221    return doctest.testmod(_HeadersUtil)
 222
 223 if __name__ == "__main__":
 224    _test()