1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
try:
    # Python 3.3+: the container ABCs live in ``collections.abc``.  The
    # aliases in ``collections`` itself were removed in Python 3.10, so this
    # location has to be tried first.
    from collections.abc import MutableMapping as DictMixin
except ImportError:
    try:
        # Python 2.6 / 2.7
        from collections import MutableMapping as DictMixin
    except ImportError:
        # Older Python 2 releases without ``collections.MutableMapping``
        from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Names exported by ``from lxml.html import *``.  Each name appears once.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
86
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  The ones that take ``namespaces`` match
# both the plain HTML tag and the same tag in the XHTML namespace, which is
# bound to the ``x`` prefix.
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})

# Matches elements whose whitespace-separated ``class`` attribute contains
# ``$class_name`` as a whole word.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# ``url(...)`` references in CSS; group 1 is the (possibly quoted) URL.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# ``@import "..."`` statements in CSS; only the double-quoted form is matched.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x': XHTML_NAMESPACE})
# One run of non-space characters, i.e. one space-separated entry.
_archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding='unicode'))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text content of the tag (and the text in any children).

    The result is whatever the pre-compiled ``string()`` XPath expression
    ``_collect_string_content`` evaluates to for this element.
    """
    return _collect_string_content(self)
279
def cssselect(self, expr, translator='html'):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
    -- note that pre-compiling the expression can provide a substantial
    speedup.

    ``expr`` is the CSS selector string; ``translator`` is passed through
    unchanged to ``CSSSelector``.
    """
    # Imported at call time rather than at module level (presumably to keep
    # the cssselect dependency optional -- confirm against packaging).
    from lxml.cssselect import CSSSelector
    return CSSSelector(expr, translator=translator)(self)
292
293
294
295
296
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored.

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.

    Raises ``TypeError`` if no base URL is available and ``ValueError``
    for an unknown ``handle_failures`` value.  Both checks run before the
    document is modified.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    # Pick the replacement callback first, so that an invalid
    # ``handle_failures`` is rejected before any <base> tag is dropped.
    if handle_failures == 'ignore':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # keep the link as it is
                return href
    elif handle_failures == 'discard':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # None tells rewrite_links() to remove the link
                return None
    elif handle_failures is None:
        def link_repl(href):
            return urljoin(base_url, href)
    else:
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    if resolve_base_href:
        self.resolve_base_href()

    # rewrite_links() resolves <base href> by default.  At this point the
    # tags were either resolved just above or are meant to be ignored, so
    # it must not resolve them again.
    self.rewrite_links(link_repl, resolve_base_href=False)
341
343 """
344 Find any ``<base href>`` tag in the document, and apply its
345 values to all links found in the document. Also remove the
346 tag once it has been applied.
347
348 If ``handle_failures`` is None (default), a failure to process
349 a URL will abort the processing. If set to 'ignore', errors
350 are ignored. If set to 'discard', failing URLs will be removed.
351 """
352 base_href = None
353 basetags = self.xpath('//base[@href]|//x:base[@href]',
354 namespaces={'x': XHTML_NAMESPACE})
355 for b in basetags:
356 base_href = b.get('href')
357 b.drop_tree()
358 if not base_href:
359 return
360 self.make_links_absolute(base_href, resolve_base_href=False,
361 handle_failures=handle_failures)
362
364 """
365 Yield (element, attribute, link, pos), where attribute may be None
366 (indicating the link is in the text). ``pos`` is the position
367 where the link occurs; often 0, but sometimes something else in
368 the case of links in stylesheets or style tags.
369
370 Note: <base href> is *not* taken into account in any way. The
371 link you get is exactly the link in the document.
372
373 Note: multiple links inside of a single text string or
374 attribute value are returned in reversed order. This makes it
375 possible to replace or delete them from the text string value
376 based on their reported text positions. Otherwise, a
377 modification at one text position can change the positions of
378 links reported later on.
379 """
380 link_attrs = defs.link_attrs
381 for el in self.iter(etree.Element):
382 attribs = el.attrib
383 tag = _nons(el.tag)
384 if tag == 'object':
385 codebase = None
386
387
388 if 'codebase' in attribs:
389 codebase = el.get('codebase')
390 yield (el, 'codebase', codebase, 0)
391 for attrib in ('classid', 'data'):
392 if attrib in attribs:
393 value = el.get(attrib)
394 if codebase is not None:
395 value = urljoin(codebase, value)
396 yield (el, attrib, value, 0)
397 if 'archive' in attribs:
398 for match in _archive_re.finditer(el.get('archive')):
399 value = match.group(0)
400 if codebase is not None:
401 value = urljoin(codebase, value)
402 yield (el, 'archive', value, match.start())
403 else:
404 for attrib in link_attrs:
405 if attrib in attribs:
406 yield (el, attrib, attribs[attrib], 0)
407 if tag == 'meta':
408 http_equiv = attribs.get('http-equiv', '').lower()
409 if http_equiv == 'refresh':
410 content = attribs.get('content', '')
411 i = content.find(';')
412 url = content[i+1:] if i >= 0 else content
413 if url[:4].lower() == 'url=':
414 url = url[4:]
415
416
417
418 if url:
419 url, pos = _unquote_match(url, i + 5)
420 yield (el, 'content', url, pos)
421 elif tag == 'param':
422 valuetype = el.get('valuetype') or ''
423 if valuetype.lower() == 'ref':
424
425
426
427
428
429
430 yield (el, 'value', el.get('value'), 0)
431 elif tag == 'style' and el.text:
432 urls = [
433
434 _unquote_match(match.group(1), match.start(1))[::-1]
435 for match in _iter_css_urls(el.text)
436 ] + [
437 (match.start(1), match.group(1))
438 for match in _iter_css_imports(el.text)
439 ]
440 if urls:
441
442
443
444 urls.sort(reverse=True)
445 for start, url in urls:
446 yield (el, None, url, start)
447 if 'style' in attribs:
448 urls = list(_iter_css_urls(attribs['style']))
449 if urls:
450
451 for match in urls[::-1]:
452 url, start = _unquote_match(match.group(1), match.start(1))
453 yield (el, 'style', url, start)
454
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.
    """
    if base_href is not None:
        # Make every link absolute against ``base_href`` first, so the
        # callback below receives the resolved links.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    for el, attrib, link, pos in self.iterlinks():
        # The callback sees the link without surrounding whitespace; the
        # comparison and the splicing below use the link as reported.
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute, or the whole text when the link was
            # found in the element's text.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue

        if attrib is None:
            # Link inside the element text: splice at the reported offset.
            new = el.text[:pos] + new_link + el.text[pos+len(link):]
            el.text = new
        else:
            cur = el.get(attrib)
            if not pos and len(cur) == len(link):
                # The link is the entire attribute value.
                new = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
            el.set(attrib, new)
503
504
506 """
507 An object that represents a method on an element as a function;
508 the function takes either an element or an HTML string. It
509 returns whatever the function normally returns, or if the function
510 works in-place (and so returns None) it returns a serialized form
511 of the resulting document.
512 """
518 result_type = type(doc)
519 if isinstance(doc, basestring):
520 if 'copy' in kw:
521 raise TypeError(
522 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
523 doc = fromstring(doc, **kw)
524 else:
525 if 'copy' in kw:
526 make_a_copy = kw.pop('copy')
527 else:
528 make_a_copy = self.copy
529 if make_a_copy:
530 doc = copy.deepcopy(doc)
531 meth = getattr(doc, self.name)
532 result = meth(*args, **kw)
533
534 if result is None:
535
536 return _transform_result(result_type, doc)
537 else:
538 return result
539
540 find_rel_links = _MethodFunc('find_rel_links', copy=False)
541 find_class = _MethodFunc('find_class', copy=False)
542 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
543 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
544 iterlinks = _MethodFunc('iterlinks', copy=False)
545 rewrite_links = _MethodFunc('rewrite_links', copy=True)
546
549
552
555
558
559
561 """A lookup scheme for HTML Element classes.
562
563 To create a lookup instance with different Element classes, pass a tag
564 name mapping of Element classes in the ``classes`` keyword argument and/or
565 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
566 The special key '*' denotes a Mixin class that should be mixed into all
567 Element classes.
568 """
569 _default_element_classes = {}
570
571 - def __init__(self, classes=None, mixins=None):
588
589 - def lookup(self, node_type, document, namespace, name):
600
601
602
603
604
605 _looks_like_full_html_unicode = re.compile(
606 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
607 _looks_like_full_html_bytes = re.compile(
608 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
609
622
625 """
626 Parses several HTML elements, returning a list of elements.
627
628 The first item in the list may be a string (though leading
629 whitespace is removed). If no_leading_text is true, then it will
630 be an error if there is leading text, and it will always be a list
631 of only elements.
632
633 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
634 """
635 if parser is None:
636 parser = html_parser
637
638 if isinstance(html, bytes):
639 if not _looks_like_full_html_bytes(html):
640
641 html = ('<html><body>'.encode('ascii') + html +
642 '</body></html>'.encode('ascii'))
643 else:
644 if not _looks_like_full_html_unicode(html):
645 html = '<html><body>%s</body></html>' % html
646 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
647 assert _nons(doc.tag) == 'html'
648 bodies = [e for e in doc if _nons(e.tag) == 'body']
649 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
650 body = bodies[0]
651 elements = []
652 if no_leading_text and body.text and body.text.strip():
653 raise etree.ParserError(
654 "There is leading text: %r" % body.text)
655 if body.text and body.text.strip():
656 elements.append(body.text)
657 elements.extend(body)
658
659
660 return elements
661
664 """
665 Parses a single HTML element; it is an error if there is more than
666 one element, or if anything but whitespace precedes or follows the
667 element.
668
669 If create_parent is true (or is a tag name) then a parent node
670 will be created to encapsulate the HTML in a single element. In
671 this case, leading or trailing text is allowed.
672
673 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
674 """
675 if parser is None:
676 parser = html_parser
677
678 accept_leading_text = bool(create_parent)
679
680 elements = fragments_fromstring(
681 html, parser=parser, no_leading_text=not accept_leading_text,
682 base_url=base_url, **kw)
683
684 if create_parent:
685 if not isinstance(create_parent, basestring):
686 create_parent = 'div'
687 new_root = Element(create_parent)
688 if elements:
689 if isinstance(elements[0], basestring):
690 new_root.text = elements[0]
691 del elements[0]
692 new_root.extend(elements)
693 return new_root
694
695 if not elements:
696 raise etree.ParserError('No elements found')
697 if len(elements) > 1:
698 raise etree.ParserError(
699 "Multiple elements found (%s)"
700 % ', '.join([_element_name(e) for e in elements]))
701 el = elements[0]
702 if el.tail and el.tail.strip():
703 raise etree.ParserError(
704 "Element followed by text: %r" % el.tail)
705 el.tail = None
706 return el
707
708 -def fromstring(html, base_url=None, parser=None, **kw):
774
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.

    When ``parser`` is None the module-level ``html_parser`` is used.
    Any further keyword arguments are passed on to ``etree.parse()``.
    """
    if parser is None:
        parser = html_parser
    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
787
795
797 if isinstance(el, etree.CommentBase):
798 return 'comment'
799 elif isinstance(el, basestring):
800 return 'string'
801 else:
802 return _nons(el.tag)
803
804
805
806
807
912
913 HtmlElementClassLookup._default_element_classes['form'] = FormElement
914
951
953 if not url:
954 raise ValueError("cannot submit, no URL provided")
955
956 try:
957 from urllib import urlencode, urlopen
958 except ImportError:
959 from urllib.request import urlopen
960 from urllib.parse import urlencode
961 if method == 'GET':
962 if '?' in url:
963 url += '&'
964 else:
965 url += '?'
966 url += urlencode(values)
967 data = None
968 else:
969 data = urlencode(values)
970 return urlopen(url, data)
971
973
981 raise KeyError(
982 "You cannot remove keys from ElementDict")
986 return item in self.inputs
991
993 return '<%s for form %s>' % (
994 self.__class__.__name__,
995 self.inputs.form._name())
996
1062
1090
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        content = self.text or ''
        # Child elements are serialised as XML when this element's tag is
        # in the XHTML namespace, as HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        for el in self:
            # Append each child's serialised markup to the leading text.
            content += etree.tostring(
                el, method=serialisation_method, encoding='unicode')
        return content

    def _value__set(self, value):
        # Drop all child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value leaves an empty element with no children.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1118
1119 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1120
1122 """
1123 ``<select>`` element. You can get the name with ``.name``.
1124
1125 ``.value`` will be the value of the selected option, unless this
1126 is a multi-select element (``<select multiple>``), in which case
1127 it will be a set-like object. In either case ``.value_options``
1128 gives the possible values.
1129
1130 The boolean attribute ``.multiple`` shows if this is a
1131 multi-select.
1132 """
1133
1135 """
1136 Get/set the value of this select (the selected option).
1137
1138 If this is a multi-select, this is a set-like object that
1139 represents all the selected options.
1140 """
1141 if self.multiple:
1142 return MultipleSelectOptions(self)
1143 for el in _options_xpath(self):
1144 if el.get('selected') is not None:
1145 value = el.get('value')
1146 if value is None:
1147 value = el.text or ''
1148 if value:
1149 value = value.strip()
1150 return value
1151 return None
1152
1154 if self.multiple:
1155 if isinstance(value, basestring):
1156 raise TypeError(
1157 "You must pass in a sequence")
1158 self.value.clear()
1159 self.value.update(value)
1160 return
1161 if value is not None:
1162 value = value.strip()
1163 for el in _options_xpath(self):
1164 opt_value = el.get('value')
1165 if opt_value is None:
1166 opt_value = el.text or ''
1167 if opt_value:
1168 opt_value = opt_value.strip()
1169 if opt_value == value:
1170 checked_option = el
1171 break
1172 else:
1173 raise ValueError(
1174 "There is no option with the value of %r" % value)
1175 for el in _options_xpath(self):
1176 if 'selected' in el.attrib:
1177 del el.attrib['selected']
1178 if value is not None:
1179 checked_option.set('selected', '')
1180
1187
1188 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1189
1204 value_options = property(value_options, doc=value_options.__doc__)
1205
1207 """
1208 Boolean attribute: is there a ``multiple`` attribute on this element.
1209 """
1210 return 'multiple' in self.attrib
1212 if value:
1213 self.set('multiple', '')
1214 elif 'multiple' in self.attrib:
1215 del self.attrib['multiple']
1216 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1217
1218 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1219
1221 """
1222 Represents all the selected options in a ``<select multiple>`` element.
1223
1224 You can add to this set-like option to select an option, or remove
1225 to unselect the option.
1226 """
1227
1229 self.select = select
1230
1232 """
1233 Iterator of all the ``<option>`` elements.
1234 """
1235 return iter(_options_xpath(self.select))
1236 options = property(options)
1237
1239 for option in self.options:
1240 if 'selected' in option.attrib:
1241 opt_value = option.get('value')
1242 if opt_value is None:
1243 opt_value = option.text or ''
1244 if opt_value:
1245 opt_value = opt_value.strip()
1246 yield opt_value
1247
def add(self, item):
    """
    Select the first option whose value equals ``item``.

    An option's value is its ``value`` attribute or, when that attribute
    is missing, its text; either one is compared with surrounding
    whitespace stripped.  Raises ``ValueError`` if no option matches.
    """
    for option in self.options:
        opt_value = option.get('value')
        if opt_value is None:
            # No ``value`` attribute: fall back to the option text.
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        if opt_value == item:
            option.set('selected', '')
            break
    else:
        # The loop ended without ``break``: nothing matched.
        raise ValueError(
            "There is no option with the value %r" % item)
1261
1263 for option in self.options:
1264 opt_value = option.get('value')
1265 if opt_value is None:
1266 opt_value = option.text or ''
1267 if opt_value:
1268 opt_value = opt_value.strip()
1269 if opt_value == item:
1270 if 'selected' in option.attrib:
1271 del option.attrib['selected']
1272 else:
1273 raise ValueError(
1274 "The option %r is not currently selected" % item)
1275 break
1276 else:
1277 raise ValueError(
1278 "There is not option with the value %r" % item)
1279
1281 return '<%s {%s} for select name=%r>' % (
1282 self.__class__.__name__,
1283 ', '.join([repr(v) for v in self]),
1284 self.select.name)
1285
1287 """
1288 This object represents several ``<input type=radio>`` elements
1289 that have the same name.
1290
1291 You can use this like a list, but also use the property
1292 ``.value`` to check/uncheck inputs. Also you can use
1293 ``.value_options`` to get the possible values.
1294 """
1295
1297 """
1298 Get/set the value, which checks the radio with that value (and
1299 unchecks any other value).
1300 """
1301 for el in self:
1302 if 'checked' in el.attrib:
1303 return el.get('value')
1304 return None
1305
1307 if value is not None:
1308 for el in self:
1309 if el.get('value') == value:
1310 checked_option = el
1311 break
1312 else:
1313 raise ValueError(
1314 "There is no radio input with the value %r" % value)
1315 for el in self:
1316 if 'checked' in el.attrib:
1317 del el.attrib['checked']
1318 if value is not None:
1319 checked_option.set('checked', '')
1320
1323
1324 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1325
1327 """
1328 Returns a list of all the possible values.
1329 """
1330 return [el.get('value') for el in self]
1331 value_options = property(value_options, doc=value_options.__doc__)
1332
1334 return '%s(%s)' % (
1335 self.__class__.__name__,
1336 list.__repr__(self))
1337
1339 """
1340 Represents a group of checkboxes (``<input type=checkbox>``) that
1341 have the same name.
1342
1343 In addition to using this like a list, the ``.value`` attribute
1344 returns a set-like object that you can add to or remove from to
1345 check and uncheck checkboxes. You can also use ``.value_options``
1346 to get the possible values.
1347 """
1348
1350 """
1351 Return a set-like object that can be modified to check or
1352 uncheck individual checkboxes according to their value.
1353 """
1354 return CheckboxValues(self)
1364 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1365
1367 """
1368 Returns a list of all the possible values.
1369 """
1370 return [el.get('value') for el in self]
1371 value_options = property(value_options, doc=value_options.__doc__)
1372
1374 return '%s(%s)' % (
1375 self.__class__.__name__, list.__repr__(self))
1376
1378
1379 """
1380 Represents the values of the checked checkboxes in a group of
1381 checkboxes with the same name.
1382 """
1383
1386
1388 return iter([
1389 el.get('value')
1390 for el in self.group
1391 if 'checked' in el.attrib])
1392
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.

    Raises ``KeyError`` if the group has no such checkbox.
    """
    for el in self.group:
        if el.get('value') == value:
            el.set('checked', '')
            break
    else:
        # The loop ended without ``break``: nothing matched.
        raise KeyError("No checkbox with value %r" % value)
1400
1402 for el in self.group:
1403 if el.get('value') == value:
1404 if 'checked' in el.attrib:
1405 del el.attrib['checked']
1406 else:
1407 raise KeyError(
1408 "The checkbox with value %r was already unchecked" % value)
1409 break
1410 else:
1411 raise KeyError(
1412 "No checkbox with value %r" % value)
1413
1415 return '<%s {%s} for checkboxes name=%r>' % (
1416 self.__class__.__name__,
1417 ', '.join([repr(v) for v in self]),
1418 self.group.name)
1419
1503
1504 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1505
1507 """
1508 Represents a ``<label>`` element.
1509
1510 Label elements are linked to other elements with their ``for``
1511 attribute. You can access this element with ``label.for_element``.
1512 """
1513
1515 """
1516 Get/set the element this label points to. Return None if it
1517 can't be found.
1518 """
1519 id = self.get('for')
1520 if not id:
1521 return None
1522 return self.body.get_element_by_id(id)
1524 id = other.get('id')
1525 if not id:
1526 raise TypeError(
1527 "Element %r has no id attribute" % other)
1528 self.set('for', id)
1532 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1533 doc=_for_element__get.__doc__)
1534
1535 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1536
1537
1538
1539
1540
1554
1556 """Convert all tags in an XHTML tree to HTML by removing their
1557 XHTML namespace.
1558 """
1559 try:
1560 xhtml = xhtml.getroot()
1561 except AttributeError:
1562 pass
1563 prefix = "{%s}" % XHTML_NAMESPACE
1564 prefix_len = len(prefix)
1565 for el in xhtml.iter(prefix + "*"):
1566 el.tag = el.tag[prefix_len:]
1567
1568
1569
1570 __str_replace_meta_content_type = re.compile(
1571 r'<meta http-equiv="Content-Type"[^>]*>').sub
1572 __bytes_replace_meta_content_type = re.compile(
1573 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1574
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding, with_tail=with_tail,
                          doctype=doctype)
    if method == 'html' and not include_meta_content_type:
        # Strip any Content-Type <meta> tag from the serialised HTML, using
        # the regex that matches the result's string type.
        if isinstance(html, str):
            html = __str_replace_meta_content_type('', html)
        else:
            html = __bytes_replace_meta_content_type(bytes(), html)
    return html
1647
1648 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1649
1651 """
1652 Open the HTML document in a web browser, saving it to a temporary
1653 file to open it. Note that this does not delete the file after
1654 use. This is mainly meant for debugging.
1655 """
1656 import os
1657 import webbrowser
1658 import tempfile
1659 if not isinstance(doc, etree._ElementTree):
1660 doc = etree.ElementTree(doc)
1661 handle, fn = tempfile.mkstemp(suffix='.html')
1662 f = os.fdopen(handle, 'wb')
1663 try:
1664 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1665 finally:
1666
1667 f.close()
1668 url = 'file://' + fn.replace(os.path.sep, '/')
1669 print(url)
1670 webbrowser.open(url)
1671
1672
1673
1674
1675
1677 """An HTML parser that is configured to return lxml.html Element
1678 objects.
1679 """
1683
1685 """An XML parser that is configured to return lxml.html Element
1686 objects.
1687
1688 Note that this parser is not really XHTML aware unless you let it
1689 load a DTD that declares the HTML entities. To do this, make sure
1690 you have the XHTML DTDs installed in your catalogs, and create the
1691 parser like this::
1692
1693 >>> parser = XHTMLParser(load_dtd=True)
1694
1695 If you additionally want to validate the document, use this::
1696
1697 >>> parser = XHTMLParser(dtd_validation=True)
1698
1699 For catalog support, see http://www.xmlsoft.org/catalog.html.
1700 """
1704
1706 """Create a new HTML Element.
1707
1708 This can also be used for XHTML documents.
1709 """
1710 v = html_parser.makeelement(*args, **kw)
1711 return v
1712
1713 html_parser = HTMLParser()
1714 xhtml_parser = XHTMLParser()
1715