Initial revision
[ardour.git] / tools / bug_tool / ClientCookie / _urllib2_support.py
1 """Integration with Python standard library module urllib2.
2
3 Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
4 the META HTTP-EQUIV tag contents, and following Refresh header redirects.
5
6 Copyright 2002-2003 John J Lee <jjl@pobox.com>
7
8 This code is free software; you can redistribute it and/or modify it under
9 the terms of the BSD License (see the file COPYING included with the
10 distribution).
11
12 """
13
14 import copy, time
15
16 import ClientCookie
17 from _ClientCookie import CookieJar, request_host
18 from _Util import isstringlike
19 from _Debug import _debug
20
21 try: True
22 except NameError:
23     True = 1
24     False = 0
25
26 CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
27
28 try:
29     from urllib2 import AbstractHTTPHandler
30 except ImportError:
31     pass
32 else:
33     import urlparse, urllib2, urllib, httplib, htmllib, formatter, string
34     from urllib2 import URLError, HTTPError
35     import types, string, socket
36     from cStringIO import StringIO
37     from _Util import seek_wrapper
38     try:
39         import threading
40         _threading = threading; del threading
41     except ImportError:
42         import dummy_threading
43         _threading = dummy_threading; del dummy_threading
44
45     # This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
46     #  (http://www.python.org/sf/549151)
47     # 2.2.3 is broken here (my fault!), 2.3 is fixed.
48     class HTTPRedirectHandler(urllib2.BaseHandler):
49         # maximum number of redirections before assuming we're in a loop
50         max_redirections = 10
51
52         # Implementation notes:
53
54         # To avoid the server sending us into an infinite loop, the request
55         # object needs to track what URLs we have already seen.  Do this by
56         # adding a handler-specific attribute to the Request object.  The value
57         # of the dict is used to count the number of times the same url has
58         # been visited.  This is needed because this isn't necessarily a loop:
59         # there is more than one way to redirect (Refresh, 302, 303, 307).
60
61         # Another handler-specific Request attribute, original_url, is used to
62         # remember the URL of the original request so that it is possible to
63         # decide whether or not RFC 2965 cookies should be turned on during
64         # redirect.
65
66         # Always unhandled redirection codes:
67         # 300 Multiple Choices: should not handle this here.
68         # 304 Not Modified: no need to handle here: only of interest to caches
69         #     that do conditional GETs
70         # 305 Use Proxy: probably not worth dealing with here
71         # 306 Unused: what was this for in the previous versions of protocol??
72
73         def redirect_request(self, newurl, req, fp, code, msg, headers):
74             """Return a Request or None in response to a redirect.
75
76             This is called by the http_error_30x methods when a redirection
77             response is received.  If a redirection should take place, return a
78             new Request to allow http_error_30x to perform the redirect;
79             otherwise, return None to indicate that an HTTPError should be
80             raised.
81
82             """
83             if code in (301, 302, 303) or (code == 307 and not req.has_data()):
84                 # Strictly (according to RFC 2616), 301 or 302 in response to
85                 # a POST MUST NOT cause a redirection without confirmation
86                 # from the user (of urllib2, in this case).  In practice,
87                 # essentially all clients do redirect in this case, so we do
88                 # the same.
89                 return Request(newurl, headers=req.headers)
90             else:
91                 raise HTTPError(req.get_full_url(), code, msg, headers, fp)
92
93         def http_error_302(self, req, fp, code, msg, headers):
94             if headers.has_key('location'):
95                 newurl = headers['location']
96             elif headers.has_key('uri'):
97                 newurl = headers['uri']
98             else:
99                 return
100             newurl = urlparse.urljoin(req.get_full_url(), newurl)
101
102             # XXX Probably want to forget about the state of the current
103             # request, although that might interact poorly with other
104             # handlers that also use handler-specific request attributes
105             new = self.redirect_request(newurl, req, fp, code, msg, headers)
106             if new is None:
107                 return
108
109             # remember where we started from
110             if hasattr(req, "original_url"):
111                 new.original_url = req.original_url
112             else:
113                 new.original_url = req.get_full_url()
114
115             # loop detection
116             # .error_302_dict[(url, code)] is number of times url
117             # previously visited as a result of a redirection with this
118             # code (error_30x_dict would be a better name).
119             new.origin_req_host = req.origin_req_host
120             if not hasattr(req, 'error_302_dict'):
121                 new.error_302_dict = req.error_302_dict = {(newurl, code): 1}
122             else:
123                 ed = new.error_302_dict = req.error_302_dict
124                 nr_visits = ed.get((newurl, code), 0)
125                 # Refreshes generate fake 302s, so we can hit the same URL as
126                 # a result of the same redirection code twice without
127                 # necessarily being in a loop!  So, allow two visits to each
128                 # URL as a result of each redirection code.
129                 if len(ed) < self.max_redirections and nr_visits < 2:
130                     ed[(newurl, code)] = nr_visits + 1
131                 else:
132                     raise HTTPError(req.get_full_url(), code,
133                                     self.inf_msg + msg, headers, fp)
134
135             if ClientCookie.REDIRECT_DEBUG:
136                 _debug("redirecting to %s", newurl)
137
138             # Don't close the fp until we are sure that we won't use it
139             # with HTTPError.  
140             fp.read()
141             fp.close()
142
143             return self.parent.open(new)
144
145         http_error_301 = http_error_303 = http_error_307 = http_error_302
146
147         inf_msg = "The HTTP server returned a redirect error that would " \
148                   "lead to an infinite loop.\n" \
149                   "The last 30x error message was:\n"
150
151
152     class Request(urllib2.Request):
153         def __init__(self, url, data=None, headers={}):
154             urllib2.Request.__init__(self, url, data, headers)
155             self.unredirected_hdrs = {}
156
157         def add_unredirected_header(self, key, val):
158             # these headers do not persist from one request to the next in a chain
159             # of requests
160             self.unredirected_hdrs[string.capitalize(key)] = val
161
162         def has_key(self, header_name):
163             if (self.headers.has_key(header_name) or
164                 self.unredirected_hdrs.has_key(header_name)):
165                 return True
166             return False
167
168         def get(self, header_name, failobj=None):
169             if self.headers.has_key(header_name):
170                 return self.headers[header_name]
171             if self.unredirected_headers.has_key(header_name):
172                 return self.unredirected_headers[header_name]
173             return failobj
174
175
176     class BaseProcessor:
177         processor_order = 500
178
179         def add_parent(self, parent):
180             self.parent = parent
181         def close(self):
182             self.parent = None
183         def __lt__(self, other):
184             if not hasattr(other, "processor_order"):
185                 return True
186             return self.processor_order < other.processor_order
187
188     class HTTPRequestUpgradeProcessor(BaseProcessor):
189         # upgrade Request to class with support for headers that don't get
190         # redirected
191         processor_order = 0  # before anything else
192
193         def http_request(self, request):
194             if not hasattr(request, "add_unredirected_header"):
195                 request = Request(request._Request__original, request.data,
196                                   request.headers)
197             return request
198
199         https_request = http_request
200
201     class HTTPEquivProcessor(BaseProcessor):
202         """Append META HTTP-EQUIV headers to regular HTTP headers."""
203         def http_response(self, request, response):
204             if not hasattr(response, "seek"):
205                 response = seek_wrapper(response)
206             # grab HTTP-EQUIV headers and add them to the true HTTP headers
207             headers = response.info()
208             for hdr, val in parse_head(response):
209                 headers[hdr] = val
210             response.seek(0)
211             return response
212
213         https_response = http_response
214
215     # XXX ATM this only takes notice of http responses -- probably
216     #   should be independent of protocol scheme (http, ftp, etc.)
217     class SeekableProcessor(BaseProcessor):
218         """Make responses seekable."""
219
220         def http_response(self, request, response):
221             if not hasattr(response, "seek"):
222                 return seek_wrapper(response)
223             return response
224
225         https_response = http_response
226
227     # XXX if this gets added to urllib2, unverifiable would end up as an
228     #   attribute on Request.
229     class HTTPCookieProcessor(BaseProcessor):
230         """Handle HTTP cookies."""
231         def __init__(self, cookies=None):
232             if cookies is None:
233                 cookies = CookieJar()
234             self.cookies = cookies
235
236         def _unverifiable(self, request):
237             if hasattr(request, "error_302_dict") and request.error_302_dict:
238                 redirect = True
239             else:
240                 redirect = False
241             if (redirect or
242                 (hasattr(request, "unverifiable") and request.unverifiable)):
243                 unverifiable = True
244             else:
245                 unverifiable = False
246             return unverifiable
247
248         def http_request(self, request):
249             unverifiable = self._unverifiable(request)
250             if not unverifiable:
251                 # Stuff request-host of this origin transaction into Request
252                 # object, because we need to know it to know whether cookies
253                 # should be in operation during derived requests (redirects,
254                 # specifically -- including refreshes).
255                 request.origin_req_host = request_host(request)
256             self.cookies.add_cookie_header(request, unverifiable)
257             return request
258
259         def http_response(self, request, response): 
260             unverifiable = self._unverifiable(request)
261             self.cookies.extract_cookies(response, request, unverifiable)
262             return response
263
264         https_request = http_request
265         https_response = http_response
266
267     class HTTPRefererProcessor(BaseProcessor):
268         """Add Referer header to requests.
269
270         This only makes sense if you use each RefererProcessor for a single
271         chain of requests only (so, for example, if you use a single
272         HTTPRefererProcessor to fetch a series of URLs extracted from a single
273         page, this will break).
274
275         """
276         def __init__(self):
277             self.referer = None
278
279         def http_request(self, request):
280             if ((self.referer is not None) and
281                 not request.has_key("Referer")):
282                 request.add_unredirected_header("Referer", self.referer)
283             return request
284
285         def http_response(self, request, response):
286             self.referer = response.geturl()
287             return response
288
289         https_request = http_request
290         https_response = http_response
291
292     class HTTPStandardHeadersProcessor(BaseProcessor):
293         def http_request(self, request):
294             host = request.get_host()
295             if not host:
296                 raise URLError('no host given')
297
298             if request.has_data():  # POST
299                 data = request.get_data()
300                 if not request.has_key('Content-type'):
301                     request.add_unredirected_header(
302                         'Content-type',
303                         'application/x-www-form-urlencoded')
304                 if not request.has_key('Content-length'):
305                     request.add_unredirected_header(
306                         'Content-length', '%d' % len(data))
307
308             scheme, sel = urllib.splittype(request.get_selector())
309             sel_host, sel_path = urllib.splithost(sel)
310             if not request.has_key('Host'):
311                 request.add_unredirected_header('Host', sel_host or host)
312             for name, value in self.parent.addheaders:
313                 name = string.capitalize(name)
314                 if not request.has_key(name):
315                     request.add_unredirected_header(name, value)
316
317             return request
318
319         https_request = http_request
320
321     class HTTPResponseDebugProcessor(BaseProcessor):
322         processor_order = 900  # before redirections, after everything else
323
324         def http_response(self, request, response):
325             if not hasattr(response, "seek"):
326                 response = seek_wrapper(response)
327             _debug(response.read())
328             _debug("*****************************************************")
329             response.seek(0)
330             return response
331
332         https_response = http_response
333
334     class HTTPRefreshProcessor(BaseProcessor):
335         """Perform HTTP Refresh redirections.
336
337         Note that if a non-200 HTTP code has occurred (for example, a 30x
338         redirect), this processor will do nothing.
339
340         By default, only zero-time Refresh headers are redirected.  Use the
341         max_time constructor argument to allow Refresh with longer pauses.
342         Use the honor_time argument to control whether the requested pause
343         is honoured (with a time.sleep()) or skipped in favour of immediate
344         redirection.
345
346         """
347         processor_order = 1000
348
349         def __init__(self, max_time=0, honor_time=True):
350             self.max_time = max_time
351             self.honor_time = honor_time
352
353         def http_response(self, request, response):
354             code, msg, hdrs = response.code, response.msg, response.info()
355
356             if code == 200 and hdrs.has_key("refresh"):
357                 refresh = hdrs["refresh"]
358                 i = string.find(refresh, ";")
359                 if i != -1:
360                     pause, newurl_spec = refresh[:i], refresh[i+1:]
361                     i = string.find(newurl_spec, "=")
362                     if i != -1:
363                         pause = int(pause)
364                         if pause <= self.max_time:
365                             if pause != 0 and self.honor_time:
366                                 time.sleep(pause)
367                             newurl = newurl_spec[i+1:]
368                             # fake a 302 response
369                             hdrs["location"] = newurl
370                             response = self.parent.error(
371                                 'http', request, response, 302, msg, hdrs)
372
373             return response
374
375         https_response = http_response
376
377     class HTTPErrorProcessor(BaseProcessor):
378         """Process non-200 HTTP error responses.
379
380         This just passes the job on to the Handler.<proto>_error_<code>
381         methods, via the OpenerDirector.error method.
382
383         """
384         processor_order = 1000
385
386         def http_response(self, request, response):
387             code, msg, hdrs = response.code, response.msg, response.info()
388
389             if code != 200:
390                 response = self.parent.error(
391                     'http', request, response, code, msg, hdrs)
392
393             return response
394
395         https_response = http_response
396
397
398     class OpenerDirector(urllib2.OpenerDirector):
399         # XXX might be useful to have remove_processor, too (say you want to
400         #   set a new RefererProcessor, but keep the old CookieProcessor --
401         #   could always just create everything anew, though (using old
402         #   CookieJar object to create CookieProcessor)
403         def __init__(self):
404             urllib2.OpenerDirector.__init__(self)
405             #self.processors = []
406             self.process_response = {}
407             self.process_request = {}
408
409         def add_handler(self, handler):
410             # XXX
411             # tidy me
412             # the same handler could be added twice without detection
413             added = 0
414             for meth in dir(handler.__class__):
415                 if meth[-5:] == '_open':
416                     protocol = meth[:-5]
417                     if self.handle_open.has_key(protocol):
418                         self.handle_open[protocol].append(handler)
419                         self.handle_open[protocol].sort()
420                     else:
421                         self.handle_open[protocol] = [handler]
422                     added = 1
423                     continue
424                 i = string.find(meth, '_')
425                 j = string.find(meth[i+1:], '_') + i + 1
426                 if j != -1 and meth[i+1:j] == 'error':
427                     proto = meth[:i]
428                     kind = meth[j+1:]
429                     try:
430                         kind = int(kind)
431                     except ValueError:
432                         pass
433                     dict = self.handle_error.get(proto, {})
434                     if dict.has_key(kind):
435                         dict[kind].append(handler)
436                         dict[kind].sort()
437                     else:
438                         dict[kind] = [handler]
439                     self.handle_error[proto] = dict
440                     added = 1
441                     continue
442                 if meth[-9:] == "_response":
443                     protocol = meth[:-9]
444                     if self.process_response.has_key(protocol):
445                         self.process_response[protocol].append(handler)
446                         self.process_response[protocol].sort()
447                     else:
448                         self.process_response[protocol] = [handler]
449                     added = True
450                     continue
451                 elif meth[-8:] == "_request":
452                     protocol = meth[:-8]
453                     if self.process_request.has_key(protocol):
454                         self.process_request[protocol].append(handler)
455                         self.process_request[protocol].sort()
456                     else:
457                         self.process_request[protocol] = [handler]
458                     added = True
459                     continue
460             if added:
461                 self.handlers.append(handler)
462                 self.handlers.sort()
463                 handler.add_parent(self)
464
465 ##         def add_processor(self, processor):
466 ##             added = False
467 ##             for meth in dir(processor):
468 ##                 if meth[-9:] == "_response":
469 ##                     protocol = meth[:-9]
470 ##                     if self.process_response.has_key(protocol):
471 ##                         self.process_response[protocol].append(processor)
472 ##                         self.process_response[protocol].sort()
473 ##                     else:
474 ##                         self.process_response[protocol] = [processor]
475 ##                     added = True
476 ##                     continue
477 ##                 elif meth[-8:] == "_request":
478 ##                     protocol = meth[:-8]
479 ##                     if self.process_request.has_key(protocol):
480 ##                         self.process_request[protocol].append(processor)
481 ##                         self.process_request[protocol].sort()
482 ##                     else:
483 ##                         self.process_request[protocol] = [processor]
484 ##                     added = True
485 ##                     continue
486 ##             if added:
487 ##                 self.processors.append(processor)
488 ##                 # XXX base class sorts .handlers, but I have no idea why
489 ##                 #self.processors.sort()
490 ##                 processor.add_parent(self)
491
492         def _request(self, url_or_req, data):
493             if isstringlike(url_or_req):
494                 req = Request(url_or_req, data)
495             else:
496                 # already a urllib2.Request instance
497                 req = url_or_req
498                 if data is not None:
499                     req.add_data(data)
500             return req
501
502         def open(self, fullurl, data=None):
503             req = self._request(fullurl, data)
504             type = req.get_type()
505
506             # pre-process request
507             # XXX should we allow a Processor to change the type (URL
508             #   scheme) of the request?
509             meth_name = type+"_request"
510             for processor in self.process_request.get(type, []):
511                 meth = getattr(processor, meth_name)
512                 req = meth(req)
513
514             response = urllib2.OpenerDirector.open(self, req, data)
515
516             # post-process response
517             meth_name = type+"_response"
518             for processor in self.process_response.get(type, []):
519                 meth = getattr(processor, meth_name)
520                 response = meth(req, response)
521
522             return response
523
524 ##         def close(self):
525 ##             urllib2.OpenerDirector.close(self)
526 ##             for processor in self.processors:
527 ##                 processor.close()
528 ##             self.processors = []
529
530
531     # Note the absence of redirect and header-adding code here
532     # (AbstractHTTPHandler), and the lack of other clutter that would be
533     # here without Processors.
534     class AbstractHTTPHandler(urllib2.BaseHandler):
535         def do_open(self, http_class, req):
536             host = req.get_host()
537             if not host:
538                 raise URLError('no host given')
539
540             h = http_class(host) # will parse host:port
541             if ClientCookie.HTTP_DEBUG:
542                 h.set_debuglevel(1)
543
544             if req.has_data():
545                 h.putrequest('POST', req.get_selector())
546             else:
547                 h.putrequest('GET', req.get_selector())
548
549             for k, v in req.headers.items():
550                 h.putheader(k, v)
551             for k, v in req.unredirected_hdrs.items():
552                 h.putheader(k, v)
553
554             # httplib will attempt to connect() here.  be prepared
555             # to convert a socket error to a URLError.
556             try:
557                 h.endheaders()
558             except socket.error, err:
559                 raise URLError(err)
560             if req.has_data():
561                 h.send(req.get_data())
562
563             code, msg, hdrs = h.getreply()
564             fp = h.getfile()
565
566             response = urllib.addinfourl(fp, hdrs, req.get_full_url())
567             response.code = code
568             response.msg = msg
569
570             return response
571
572
573     # XXX would self.reset() work, instead of raising this exception?
574     class EndOfHeadError(Exception): pass
575     class HeadParser(htmllib.HTMLParser):
576         # only these elements are allowed in or before HEAD of document
577         head_elems = ("html", "head",
578                       "title", "base",
579                       "script", "style", "meta", "link", "object")
580         def __init__(self):
581             htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
582             self.http_equiv = []
583
584         def start_meta(self, attrs):
585             http_equiv = content = None
586             for key, value in attrs:
587                 if key == "http-equiv":
588                     http_equiv = value
589                 elif key == "content":
590                     content = value
591             if http_equiv is not None:
592                 self.http_equiv.append((http_equiv, content))
593
594         def handle_starttag(self, tag, method, attrs):
595             if tag in self.head_elems:
596                 method(attrs)
597             else:
598                 raise EndOfHeadError()
599
600         def handle_endtag(self, tag, method):
601             if tag in self.head_elems:
602                 method()
603             else:
604                 raise EndOfHeadError()
605
606         def end_head(self):
607             raise EndOfHeadError()
608
609     def parse_head(file):
610         """Return a list of key, value pairs."""
611         hp = HeadParser()
612         while 1:
613             data = file.read(CHUNK)
614             try:
615                 hp.feed(data)
616             except EndOfHeadError:
617                 break
618             if len(data) != CHUNK:
619                 # this should only happen if there is no HTML body, or if
620                 # CHUNK is big
621                 break
622         return hp.http_equiv
623
624
625     class HTTPHandler(AbstractHTTPHandler):
626         def http_open(self, req):
627             return self.do_open(httplib.HTTP, req)
628
629     if hasattr(httplib, 'HTTPS'):
630         class HTTPSHandler(AbstractHTTPHandler):
631             def https_open(self, req):
632                 return self.do_open(httplib.HTTPS, req)
633
634
635     def build_opener(*handlers):
636         """Create an opener object from a list of handlers and processors.
637
638         The opener will use several default handlers and processors, including
639         support for HTTP and FTP.  If there is a ProxyHandler, it must be at the
640         front of the list of handlers.  (Yuck.  This is fixed in 2.3.)
641
642         If any of the handlers passed as arguments are subclasses of the
643         default handlers, the default handlers will not be used.
644         """
645         opener = OpenerDirector()
646         default_classes = [
647             # handlers
648             urllib2.ProxyHandler,
649             urllib2.UnknownHandler,
650             HTTPHandler,  # from this module (derived from new AbstractHTTPHandler)
651             urllib2.HTTPDefaultErrorHandler,
652             HTTPRedirectHandler,  # from this module (bugfixed)
653             urllib2.FTPHandler,
654             urllib2.FileHandler,
655             # processors
656             HTTPRequestUpgradeProcessor,
657             #HTTPEquivProcessor,
658             #SeekableProcessor,
659             HTTPCookieProcessor,
660             #HTTPRefererProcessor,
661             HTTPStandardHeadersProcessor,
662             #HTTPRefreshProcessor,
663             HTTPErrorProcessor
664             ]
665         if hasattr(httplib, 'HTTPS'):
666             default_classes.append(HTTPSHandler)
667         skip = []
668         for klass in default_classes:
669             for check in handlers:
670                 if type(check) == types.ClassType:
671                     if issubclass(check, klass):
672                         skip.append(klass)
673                 elif type(check) == types.InstanceType:
674                     if isinstance(check, klass):
675                         skip.append(klass)
676         for klass in skip:
677             default_classes.remove(klass)
678
679         to_add = []
680         for klass in default_classes:
681             to_add.append(klass())
682         for h in handlers:
683             if type(h) == types.ClassType:
684                 h = h()
685             to_add.append(h)
686
687         for instance in to_add:
688             opener.add_handler(instance)
689 ##             # yuck
690 ##             if hasattr(instance, "processor_order"):
691 ##                 opener.add_processor(instance)
692 ##             else:
693 ##                 opener.add_handler(instance)
694
695         return opener
696
697
698     _opener = None
699     urlopen_lock = _threading.Lock()
700     def urlopen(url, data=None):
701         global _opener
702         if _opener is None:
703             urlopen_lock.acquire()
704             try:
705                 if _opener is None:
706                     _opener = build_opener()
707             finally:
708                 urlopen_lock.release()
709         return _opener.open(url, data)
710
711     def install_opener(opener):
712         global _opener
713         _opener = opener