1 """Integration with Python standard library module urllib2.
3 Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
4 the META HTTP-EQUIV tag contents, and following Refresh header redirects.
6 Copyright 2002-2003 John J Lee <jjl@pobox.com>
8 This code is free software; you can redistribute it and/or modify it under
9 the terms of the BSD License (see the file COPYING included with the
17 from _ClientCookie import CookieJar, request_host
18 from _Util import isstringlike
19 from _Debug import _debug
26 CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
29 from urllib2 import AbstractHTTPHandler
33 import urlparse, urllib2, urllib, httplib, htmllib, formatter, string
34 from urllib2 import URLError, HTTPError
35 import types, string, socket
36 from cStringIO import StringIO
37 from _Util import seek_wrapper
40 _threading = threading; del threading
42 import dummy_threading
43 _threading = dummy_threading; del dummy_threading
45 # This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
46 # (http://www.python.org/sf/549151)
47 # 2.2.3 is broken here (my fault!), 2.3 is fixed.
class HTTPRedirectHandler(urllib2.BaseHandler):
    """Redirect handler with a loop-detection bugfix for older urllib2.

    NOTE(review): this view of the file has lines elided; the comments
    below describe only the code that is visible here.
    """
    # maximum number of redirections before assuming we're in a loop

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute to the Request object.  The value
    # of the dict is used to count the number of times the same url has
    # been visited.  This is needed because this isn't necessarily a loop:
    # there is more than one way to redirect (Refresh, 302, 303, 307).

    # Another handler-specific Request attribute, original_url, is used to
    # remember the URL of the original request so that it is possible to
    # decide whether or not RFC 2965 cookies should be turned on during

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not handle this here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in the previous versions of protocol??

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        """
        # 307 with data is a POST redirect requiring user confirmation,
        # so only data-less 307s (and all 301/302/303) are followed.
        if code in (301, 302, 303) or (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            return Request(newurl, headers=req.headers)
        # Redirections we refuse to follow are surfaced as HTTPError.
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Prefer the standard Location header; fall back to URI.
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        # NOTE(review): the no-header early-return appears elided here.
        # Resolve a possibly-relative redirect target against the request URL.
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)

        # remember where we started from
        if hasattr(req, "original_url"):
            new.original_url = req.original_url
            # NOTE(review): an else: line appears elided before this line.
            new.original_url = req.get_full_url()

        # .error_302_dict[(url, code)] is number of times url
        # previously visited as a result of a redirection with this
        # code (error_30x_dict would be a better name).
        new.origin_req_host = req.origin_req_host
        if not hasattr(req, 'error_302_dict'):
            new.error_302_dict = req.error_302_dict = {(newurl, code): 1}
            # NOTE(review): an else: line appears elided; the following
            # presumably runs when error_302_dict already exists.
            ed = new.error_302_dict = req.error_302_dict
            nr_visits = ed.get((newurl, code), 0)
            # Refreshes generate fake 302s, so we can hit the same URL as
            # a result of the same redirection code twice without
            # necessarily being in a loop!  So, allow two visits to each
            # URL as a result of each redirection code.
            if len(ed) < self.max_redirections and nr_visits < 2:
                ed[(newurl, code)] = nr_visits + 1
                # NOTE(review): an else: presumably precedes the raise,
                # reporting a redirect loop via inf_msg.
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)

        if ClientCookie.REDIRECT_DEBUG:
            _debug("redirecting to %s", newurl)

        # Don't close the fp until we are sure that we won't use it
        return self.parent.open(new)

    # All the 30x codes handled here share one implementation.
    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
class Request(urllib2.Request):
    """urllib2.Request subclass with support for 'unredirected' headers.

    Unredirected headers are sent on this request only; they are not
    copied onto new Request objects built while following redirects.
    """
    def __init__(self, url, data=None, headers={}):
        urllib2.Request.__init__(self, url, data, headers)
        # headers that will not persist across a redirection chain
        self.unredirected_hdrs = {}

    def add_unredirected_header(self, key, val):
        # these headers do not persist from one request to the next in a chain
        # header names are normalised with string.capitalize()
        self.unredirected_hdrs[string.capitalize(key)] = val

    def has_key(self, header_name):
        # True if the header is in either the ordinary or the
        # unredirected header dict.
        if (self.headers.has_key(header_name) or
            self.unredirected_hdrs.has_key(header_name)):
            # NOTE(review): the return True / return False lines appear
            # to be elided in this view.
168 def get(self, header_name, failobj=None):
169 if self.headers.has_key(header_name):
170 return self.headers[header_name]
171 if self.unredirected_headers.has_key(header_name):
172 return self.unredirected_headers[header_name]
    # Default sort key: processors run in ascending processor_order.
    processor_order = 500

    def add_parent(self, parent):
        # NOTE(review): the body is not visible in this view; presumably
        # it stores parent (the OpenerDirector) on self.

    def __lt__(self, other):
        # Objects lacking a processor_order sort after all processors.
        if not hasattr(other, "processor_order"):
            # NOTE(review): a return True line appears elided here.
        return self.processor_order < other.processor_order
class HTTPRequestUpgradeProcessor(BaseProcessor):
    # upgrade Request to class with support for headers that don't get
    # redirected (the Request class defined in this module)
    processor_order = 0  # before anything else

    def http_request(self, request):
        # Wrap plain urllib2.Request objects in this module's Request so
        # later processors can rely on add_unredirected_header().
        if not hasattr(request, "add_unredirected_header"):
            # _Request__original is the name-mangled .__original attribute
            # of urllib2.Request (the original URL string).
            request = Request(request._Request__original, request.data,
            # NOTE(review): the rest of this call, and the return of
            # request, appear to be elided in this view.

    https_request = http_request
class HTTPEquivProcessor(BaseProcessor):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""
    def http_response(self, request, response):
        # The response must be seekable: parse_head reads from it, and
        # downstream consumers need to read it again from the start.
        if not hasattr(response, "seek"):
            response = seek_wrapper(response)
        # grab HTTP-EQUIV headers and add them to the true HTTP headers
        headers = response.info()
        for hdr, val in parse_head(response):
            # NOTE(review): the loop body (adding hdr/val into headers)
            # and the return appear to be elided in this view.

    https_response = http_response
215 # XXX ATM this only takes notice of http responses -- probably
216 # should be independent of protocol scheme (http, ftp, etc.)
class SeekableProcessor(BaseProcessor):
    """Make responses seekable."""

    def http_response(self, request, response):
        # Wrap non-seekable responses; an already-seekable response is
        # presumably returned unchanged (that line is elided in this view).
        if not hasattr(response, "seek"):
            return seek_wrapper(response)

    https_response = http_response
227 # XXX if this gets added to urllib2, unverifiable would end up as an
228 # attribute on Request.
class HTTPCookieProcessor(BaseProcessor):
    """Handle HTTP cookies."""
    def __init__(self, cookies=None):
        # NOTE(review): a guard ("if cookies is None:" or similar) appears
        # elided here; as visible, a supplied jar would be replaced.
        cookies = CookieJar()
        self.cookies = cookies

    def _unverifiable(self, request):
        # A request produced by a redirect is not the original
        # user-initiated ("verifiable") transaction.
        if hasattr(request, "error_302_dict") and request.error_302_dict:
            # NOTE(review): surrounding lines are elided; the fragment
            # below is part of a larger condition.
            (hasattr(request, "unverifiable") and request.unverifiable)):

    def http_request(self, request):
        unverifiable = self._unverifiable(request)

        # Stuff request-host of this origin transaction into Request
        # object, because we need to know it to know whether cookies
        # should be in operation during derived requests (redirects,
        # specifically -- including refreshes).
        request.origin_req_host = request_host(request)
        self.cookies.add_cookie_header(request, unverifiable)

    def http_response(self, request, response):
        # Store any Set-Cookie headers from the response in the jar.
        unverifiable = self._unverifiable(request)
        self.cookies.extract_cookies(response, request, unverifiable)

    https_request = http_request
    https_response = http_response
class HTTPRefererProcessor(BaseProcessor):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).
    """
    # NOTE(review): __init__ (presumably setting self.referer = None) is
    # not visible in this view.

    def http_request(self, request):
        # Only add Referer when we have one and the caller did not set it;
        # unredirected so it does not leak across a redirect chain.
        if ((self.referer is not None) and
            not request.has_key("Referer")):
            request.add_unredirected_header("Referer", self.referer)

    def http_response(self, request, response):
        # Remember this response's URL as Referer for the next request.
        self.referer = response.geturl()

    https_request = http_request
    https_response = http_response
class HTTPStandardHeadersProcessor(BaseProcessor):
    """Add the standard HTTP headers (Content-type, Content-length, Host,
    plus the opener's default headers) without clobbering caller values."""
    def http_request(self, request):
        host = request.get_host()
        # NOTE(review): the "if not host:" guard appears elided here.
        raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_key('Content-type'):
                request.add_unredirected_header(
                    # NOTE(review): the 'Content-type' argument line
                    # appears elided here.
                    'application/x-www-form-urlencoded')
            if not request.has_key('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # Derive the Host header from the selector (handles proxy case
        # where the selector carries the real host).
        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_key('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        # Apply the opener's default headers (e.g. User-agent), again
        # without overriding anything the caller already set.
        for name, value in self.parent.addheaders:
            name = string.capitalize(name)
            if not request.has_key(name):
                request.add_unredirected_header(name, value)

    https_request = http_request
class HTTPResponseDebugProcessor(BaseProcessor):
    """Log the entire response body via _debug, for debugging."""
    processor_order = 900  # before redirections, after everything else

    def http_response(self, request, response):
        # Needs a seekable response so the body can be re-read after
        # being consumed by the log call.
        if not hasattr(response, "seek"):
            response = seek_wrapper(response)
        _debug(response.read())
        _debug("*****************************************************")
        # NOTE(review): the seek-back and return lines appear elided.

    https_response = http_response
class HTTPRefreshProcessor(BaseProcessor):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time constructor argument to allow Refresh with longer pauses.
    Use the honor_time argument to control whether the requested pause
    is honoured (with a time.sleep()) or skipped in favour of immediate
    """
    processor_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            # Refresh header format: "<pause>; url=<newurl>"
            refresh = hdrs["refresh"]
            i = string.find(refresh, ";")
            # NOTE(review): handling for a missing ";" appears elided.
            pause, newurl_spec = refresh[:i], refresh[i+1:]
            i = string.find(newurl_spec, "=")
            if pause <= self.max_time:
                if pause != 0 and self.honor_time:
                    # NOTE(review): the time.sleep(pause) line appears
                    # elided in this view.
                newurl = newurl_spec[i+1:]
                # fake a 302 response
                hdrs["location"] = newurl
                response = self.parent.error(
                    'http', request, response, 302, msg, hdrs)

    https_response = http_response
class HTTPErrorProcessor(BaseProcessor):
    """Process non-200 HTTP error responses.

    This just passes the job on to the Handler.<proto>_error_<code>
    methods, via the OpenerDirector.error method.
    """
    processor_order = 1000

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()
        # NOTE(review): the non-200 check appears elided in this view.
        response = self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
class OpenerDirector(urllib2.OpenerDirector):
    """urllib2.OpenerDirector extended with request/response processors."""
    # XXX might be useful to have remove_processor, too (say you want to
    # set a new RefererProcessor, but keep the old CookieProcessor --
    # could always just create everything anew, though (using old
    # CookieJar object to create CookieProcessor)
    # NOTE(review): the "def __init__(self):" line is not visible here.
        urllib2.OpenerDirector.__init__(self)
        #self.processors = []
        # protocol name -> sorted list of processors for that direction
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        # Register handler as opener / error-handler / processor based on
        # the specially-named methods its class defines.
        # the same handler could be added twice without detection
        for meth in dir(handler.__class__):
            if meth[-5:] == '_open':
                # e.g. http_open -> handles the "http" protocol
                # NOTE(review): the protocol extraction line is elided.
                if self.handle_open.has_key(protocol):
                    self.handle_open[protocol].append(handler)
                    self.handle_open[protocol].sort()
                    # NOTE(review): an else: line appears elided here.
                    self.handle_open[protocol] = [handler]

                # e.g. http_error_302 -> proto "http", kind 302
                i = string.find(meth, '_')
                j = string.find(meth[i+1:], '_') + i + 1
                if j != -1 and meth[i+1:j] == 'error':
                    # NOTE(review): proto/kind extraction lines elided.
                    dict = self.handle_error.get(proto, {})
                    if dict.has_key(kind):
                        dict[kind].append(handler)
                        # NOTE(review): an else: line appears elided here.
                        dict[kind] = [handler]
                    self.handle_error[proto] = dict

                # processor registration: *_response / *_request methods
                if meth[-9:] == "_response":
                    if self.process_response.has_key(protocol):
                        self.process_response[protocol].append(handler)
                        self.process_response[protocol].sort()
                        # NOTE(review): an else: line appears elided here.
                        self.process_response[protocol] = [handler]
                elif meth[-8:] == "_request":
                    if self.process_request.has_key(protocol):
                        self.process_request[protocol].append(handler)
                        self.process_request[protocol].sort()
                        # NOTE(review): an else: line appears elided here.
                        self.process_request[protocol] = [handler]

        self.handlers.append(handler)
        handler.add_parent(self)

    ## def add_processor(self, processor):
    ##     for meth in dir(processor):
    ##         if meth[-9:] == "_response":
    ##             protocol = meth[:-9]
    ##             if self.process_response.has_key(protocol):
    ##                 self.process_response[protocol].append(processor)
    ##                 self.process_response[protocol].sort()
    ##                 self.process_response[protocol] = [processor]
    ##         elif meth[-8:] == "_request":
    ##             protocol = meth[:-8]
    ##             if self.process_request.has_key(protocol):
    ##                 self.process_request[protocol].append(processor)
    ##                 self.process_request[protocol].sort()
    ##                 self.process_request[protocol] = [processor]
    ##     self.processors.append(processor)
    ##     # XXX base class sorts .handlers, but I have no idea why
    ##     #self.processors.sort()
    ##     processor.add_parent(self)

    def _request(self, url_or_req, data):
        # Accept either a URL string or a pre-built Request object.
        if isstringlike(url_or_req):
            req = Request(url_or_req, data)
        # already a urllib2.Request instance
        # NOTE(review): the else branch and return appear elided here.

    def open(self, fullurl, data=None):
        req = self._request(fullurl, data)
        type = req.get_type()

        # pre-process request
        # XXX should we allow a Processor to change the type (URL
        # scheme) of the request?
        meth_name = type+"_request"
        for processor in self.process_request.get(type, []):
            meth = getattr(processor, meth_name)
            # NOTE(review): the call to meth appears elided here.

        response = urllib2.OpenerDirector.open(self, req, data)

        # post-process response
        meth_name = type+"_response"
        for processor in self.process_response.get(type, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)
        # NOTE(review): the return of response appears elided here.

    # NOTE(review): remains of a commented-out close() implementation.
    ## urllib2.OpenerDirector.close(self)
    ## for processor in self.processors:
    ## self.processors = []
531 # Note the absence of redirect and header-adding code here
532 # (AbstractHTTPHandler), and the lack of other clutter that would be
533 # here without Processors.
class AbstractHTTPHandler(urllib2.BaseHandler):
    """Shared do_open machinery used by the HTTP and HTTPS handlers."""
    def do_open(self, http_class, req):
        host = req.get_host()
        # NOTE(review): the "if not host:" guard appears elided here.
        raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        if ClientCookie.HTTP_DEBUG:
            # NOTE(review): debug-level setup appears elided here.
        # choose request method according to the presence of POST data
            h.putrequest('POST', req.get_selector())
            h.putrequest('GET', req.get_selector())
        # send both ordinary and unredirected headers
        for k, v in req.headers.items():
        for k, v in req.unredirected_hdrs.items():

        # httplib will attempt to connect() here. be prepared
        # to convert a socket error to a URLError.
        except socket.error, err:
            # NOTE(review): the raise URLError(err) line appears elided.
            h.send(req.get_data())

        code, msg, hdrs = h.getreply()
        # NOTE(review): fp = h.getfile() and the 200/error split appear
        # elided in this view.
        response = urllib.addinfourl(fp, hdrs, req.get_full_url())
573 # XXX would self.reset() work, instead of raising this exception?
574 class EndOfHeadError(Exception): pass
class HeadParser(htmllib.HTMLParser):
    """HTML parser that collects META HTTP-EQUIV name/content pairs.

    Raises EndOfHeadError as soon as anything that cannot appear in or
    before the HEAD is encountered, so callers can stop feeding data.
    """
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "script", "style", "meta", "link", "object")

    # NOTE(review): the "def __init__(self):" line is not visible here.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def start_meta(self, attrs):
        # Record the (http-equiv, content) attribute pair if present.
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                # NOTE(review): assignment lines appear elided here.
            elif key == "content":
        if http_equiv is not None:
            self.http_equiv.append((http_equiv, content))

    def handle_starttag(self, tag, method, attrs):
        if tag in self.head_elems:
            # NOTE(review): dispatch to the tag method appears elided;
            # any other tag means we are past the HEAD.
            raise EndOfHeadError()

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            # NOTE(review): dispatch appears elided here.
            raise EndOfHeadError()

    # NOTE(review): a "def end_head(self):"-style line appears elided;
    # closing the head also terminates parsing.
        raise EndOfHeadError()
def parse_head(file):
    """Return a list of key, value pairs."""
    # Feed the file to HeadParser in CHUNK-sized pieces until the parser
    # raises EndOfHeadError (or input is exhausted).
    data = file.read(CHUNK)
    # NOTE(review): the parser construction/feed loop appears elided.
    except EndOfHeadError:
        # NOTE(review): the loop-exit appears elided here.
    if len(data) != CHUNK:
        # this should only happen if there is no HTML body, or if
        # NOTE(review): remainder of this comment and the return of the
        # collected http-equiv pairs appear elided in this view.
class HTTPHandler(AbstractHTTPHandler):
    # Open http: URLs using httplib.HTTP via the shared do_open machinery.
    def http_open(self, req):
        return self.do_open(httplib.HTTP, req)
# Only define an HTTPS handler when httplib was built with SSL support.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        def https_open(self, req):
            return self.do_open(httplib.HTTPS, req)
def build_opener(*handlers):
    """Create an opener object from a list of handlers and processors.

    The opener will use several default handlers and processors, including
    support for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.  This is fixed in 2.3.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    # NOTE(review): the start of the default_classes list literal (and
    # several entries) appear elided in this view.
        urllib2.ProxyHandler,
        urllib2.UnknownHandler,
        HTTPHandler,  # from this module (derived from new AbstractHTTPHandler)
        urllib2.HTTPDefaultErrorHandler,
        HTTPRedirectHandler,  # from this module (bugfixed)
        HTTPRequestUpgradeProcessor,
        #HTTPRefererProcessor,
        HTTPStandardHeadersProcessor,
        #HTTPRefreshProcessor,
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # Drop each default whose job is covered by a user-supplied handler
    # (either a subclass or an instance of a subclass).
    for klass in default_classes:
        for check in handlers:
            if type(check) == types.ClassType:
                if issubclass(check, klass):
                    # NOTE(review): the skip/removal lines appear elided.
            elif type(check) == types.InstanceType:
                if isinstance(check, klass):
        default_classes.remove(klass)

    # Instantiate remaining defaults, then the user-supplied handlers.
    for klass in default_classes:
        to_add.append(klass())
        # NOTE(review): the user-handler loop is partly elided here.
        if type(h) == types.ClassType:
    for instance in to_add:
        opener.add_handler(instance)
        ## if hasattr(instance, "processor_order"):
        ##     opener.add_processor(instance)
        ##     opener.add_handler(instance)
    # NOTE(review): the return of opener appears elided in this view.
# Guards lazy, one-time construction of the module-level default opener.
urlopen_lock = _threading.Lock()
def urlopen(url, data=None):
    # Build the shared opener exactly once, under the lock; subsequent
    # calls reuse it.  NOTE(review): the existing-opener check appears
    # elided in this view.
    urlopen_lock.acquire()
    _opener = build_opener()
    urlopen_lock.release()
    return _opener.open(url, data)
711 def install_opener(opener):