1 """CSS Selectors based on XPath.
2
3 This module supports selecting XML/HTML tags based on CSS selectors.
4 See the `CSSSelector` class for details.
5 """
6
7 import re
8 from lxml import etree
9
10 __all__ = ['SelectorSyntaxError', 'ExpressionError',
11 'CSSSelector']
12
13 try:
14 _basestring = basestring
15 except NameError:
16 _basestring = str
17
20
23
25 """A CSS selector.
26
27 Usage::
28
29 >>> from lxml import etree, cssselect
30 >>> select = cssselect.CSSSelector("a tag > child")
31
32 >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
33 >>> [ el.tag for el in select(root) ]
34 ['child']
35 """
40
42 return '<%s %s for %r>' % (
43 self.__class__.__name__,
44 hex(abs(id(self)))[2:],
45 self.css)
46
47
48
49
50 try:
51 _unicode = unicode
52 except NameError:
53
54 _unicode = str
55
58 obj = _unicode.__new__(cls, contents)
59 obj.pos = pos
60 return obj
61
63 return '%s(%s, %r)' % (
64 self.__class__.__name__,
65 _unicode.__repr__(self),
66 self.pos)
67
70
73
76
77
78
79
80
81
82
83
85 """
86 Represents selector.class_name
87 """
88
89 - def __init__(self, selector, class_name):
90 self.selector = selector
91 self.class_name = class_name
92
94 return '%s[%r.%s]' % (
95 self.__class__.__name__,
96 self.selector,
97 self.class_name)
98
100 sel_xpath = self.selector.xpath()
101 sel_xpath.add_condition(
102 "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' '))
103 return sel_xpath
104
106 """
107 Represents selector:name(expr)
108 """
109
110 unsupported = [
111 'target', 'lang', 'enabled', 'disabled',]
112
113 - def __init__(self, selector, type, name, expr):
114 self.selector = selector
115 self.type = type
116 self.name = name
117 self.expr = expr
118
120 return '%s[%r%s%s(%r)]' % (
121 self.__class__.__name__,
122 self.selector,
123 self.type, self.name, self.expr)
124
136
139 a, b = parse_series(expr)
140 if not a and not b and not last:
141
142 xpath.add_condition('false() and position() = 0')
143 return xpath
144 if add_name_test:
145 xpath.add_name_test()
146 xpath.add_star_prefix()
147 if a == 0:
148 if last:
149 b = 'last() - %s' % b
150 xpath.add_condition('position() = %s' % b)
151 return xpath
152 if last:
153
154 a = -a
155 b = -b
156 if b > 0:
157 b_neg = str(-b)
158 else:
159 b_neg = '+%s' % (-b)
160 if a != 1:
161 expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
162 else:
163 expr = []
164 if b >= 0:
165 expr.append('position() >= %s' % b)
166 elif b < 0 and last:
167 expr.append('position() < (last() %s)' % b)
168 expr = ' and '.join(expr)
169 if expr:
170 xpath.add_condition(expr)
171 return xpath
172
173
174
175
176
177
178
179
182
188
191
200
208
211
212 ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
213 ns.prefix = 'css'
214 ns['lower-case'] = _make_lower_case
215
217 """
218 Represents selector:ident
219 """
220
221 unsupported = ['indeterminate', 'first-line', 'first-letter',
222 'selection', 'before', 'after', 'link', 'visited',
223 'active', 'focus', 'hover']
224
225 - def __init__(self, element, type, ident):
226 self.element = element
227 assert type in (':', '::')
228 self.type = type
229 self.ident = ident
230
232 return '%s[%r%s%s]' % (
233 self.__class__.__name__,
234 self.element,
235 self.type, self.ident)
236
249
251
252 xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
253 return xpath
254
256
257 raise NotImplementedError
258
264
270
278
286
292
294 if xpath.element == '*':
295 raise NotImplementedError(
296 "*:only-of-type is not implemented")
297 xpath.add_condition('last() = 1')
298 return xpath
299
303
305 """
306 Represents selector[namespace|attrib operator value]
307 """
308
309 - def __init__(self, selector, namespace, attrib, operator, value):
310 self.selector = selector
311 self.namespace = namespace
312 self.attrib = attrib
313 self.operator = operator
314 self.value = value
315
317 if self.operator == 'exists':
318 return '%s[%r[%s]]' % (
319 self.__class__.__name__,
320 self.selector,
321 self._format_attrib())
322 else:
323 return '%s[%r[%s %s %r]]' % (
324 self.__class__.__name__,
325 self.selector,
326 self._format_attrib(),
327 self.operator,
328 self.value)
329
335
337
338 if self.namespace == '*':
339 return '@' + self.attrib
340 else:
341 return '@%s:%s' % (self.namespace, self.attrib)
342
344 path = self.selector.xpath()
345 attrib = self._xpath_attrib()
346 value = self.value
347 if self.operator == 'exists':
348 assert not value
349 path.add_condition(attrib)
350 elif self.operator == '=':
351 path.add_condition('%s = %s' % (attrib,
352 xpath_repr(value)))
353 elif self.operator == '!=':
354
355 if value:
356 path.add_condition('not(%s) or %s != %s'
357 % (attrib, attrib, xpath_repr(value)))
358 else:
359 path.add_condition('%s != %s'
360 % (attrib, xpath_repr(value)))
361
362 elif self.operator == '~=':
363 path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
364 elif self.operator == '|=':
365
366 path.add_condition('%s = %s or starts-with(%s, %s)' % (
367 attrib, xpath_repr(value),
368 attrib, xpath_repr(value + '-')))
369 elif self.operator == '^=':
370 path.add_condition('starts-with(%s, %s)' % (
371 attrib, xpath_repr(value)))
372 elif self.operator == '$=':
373
374 path.add_condition('substring(%s, string-length(%s)-%s) = %s'
375 % (attrib, attrib, len(value)-1, xpath_repr(value)))
376 elif self.operator == '*=':
377
378 path.add_condition('contains(%s, %s)' % (
379 attrib, xpath_repr(value)))
380 else:
381 assert 0, ("Unknown operator: %r" % self.operator)
382 return path
383
385 """
386 Represents namespace|element
387 """
388
389 - def __init__(self, namespace, element):
390 self.namespace = namespace
391 self.element = element
392
394 return '%s[%s]' % (
395 self.__class__.__name__,
396 self._format_element())
397
403
405 if self.namespace == '*':
406 el = self.element.lower()
407 else:
408
409 el = '%s:%s' % (self.namespace, self.element)
410 return XPathExpr(element=el)
411
413 """
414 Represents selector#id
415 """
416
418 self.selector = selector
419 self.id = id
420
422 return '%s[%r#%s]' % (
423 self.__class__.__name__,
424 self.selector, self.id)
425
430
432
436 return '%s(%r)' % (
437 self.__class__.__name__,
438 self.items)
439
443
445
446 _method_mapping = {
447 ' ': 'descendant',
448 '>': 'child',
449 '+': 'direct_adjacent',
450 '~': 'indirect_adjacent',
451 }
452
453 - def __init__(self, selector, combinator, subselector):
454 assert selector is not None
455 self.selector = selector
456 self.combinator = combinator
457 self.subselector = subselector
458
460 if self.combinator == ' ':
461 comb = '<followed>'
462 else:
463 comb = self.combinator
464 return '%s[%r %s %r]' % (
465 self.__class__.__name__,
466 self.selector,
467 comb,
468 self.subselector)
469
478
483
488
495
500
501
502
503
504 _el_re = re.compile(r'^\w+\s*$')
505 _id_re = re.compile(r'^(\w*)#(\w+)\s*$')
506 _class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
507
509 if isinstance(css_expr, _basestring):
510 match = _el_re.search(css_expr)
511 if match is not None:
512 return '%s%s' % (prefix, match.group(0).strip())
513 match = _id_re.search(css_expr)
514 if match is not None:
515 return "%s%s[@id = '%s']" % (
516 prefix, match.group(1) or '*', match.group(2))
517 match = _class_re.search(css_expr)
518 if match is not None:
519 return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
520 prefix, match.group(1) or '*', match.group(2))
521 css_expr = parse(css_expr)
522 expr = css_expr.xpath()
523 assert expr is not None, (
524 "Got None for xpath expression from %s" % repr(css_expr))
525 if prefix:
526 expr.add_prefix(prefix)
527 return str(expr)
528
530
531 - def __init__(self, prefix=None, path=None, element='*', condition=None,
532 star_prefix=False):
533 self.prefix = prefix
534 self.path = path
535 self.element = element
536 self.condition = condition
537 self.star_prefix = star_prefix
538
549
551 return '%s[%s]' % (
552 self.__class__.__name__, self)
553
555 if self.condition:
556 self.condition = '%s and (%s)' % (self.condition, condition)
557 else:
558 self.condition = condition
559
561 if self.path is None:
562 self.path = self.element
563 else:
564 self.path += self.element
565 self.element = part
566
572
574 if self.element == '*':
575
576 return
577 self.add_condition("name() = %s" % xpath_repr(self.element))
578 self.element = '*'
579
581 """
582 Adds a /* prefix if there is no prefix. This is when you need
583 to keep context's constrained to a single parent.
584 """
585 if self.path:
586 self.path += '*/'
587 else:
588 self.path = '*/'
589 self.star_prefix = True
590
591 - def join(self, combiner, other):
603
605 """
606 Represents |'d expressions. Note that unfortunately it isn't
607 the union, it's the sum, so duplicate elements will appear.
608 """
609
610 - def __init__(self, items, prefix=None):
615
619
628
629
630
631
642
644 result = []
645 while 1:
646 result.append(parse_selector(stream))
647 if stream.peek() == ',':
648 stream.next()
649 else:
650 break
651 if len(result) == 1:
652 return result[0]
653 else:
654 return Or(result)
655
670
def parse_simple_selector(stream):
    """Parse one simple selector from ``stream`` and return its node tree.

    An optional ``namespace|element`` (or bare element / ``*``) is read
    first and wrapped in an ``Element``.  Any following ``#id``,
    ``.class``, ``[attrib]`` and ``:pseudo`` / ``:function(arg)`` parts
    are then wrapped around it, in the order they appear, as ``Hash``,
    ``Class``, ``Attrib`` (via ``parse_attrib``), ``Pseudo`` and
    ``Function`` nodes.

    Raises ``SelectorSyntaxError`` when a symbol, ``]`` or ``)`` is
    expected but something else is found.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No explicit element name: match any element in any namespace.
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # Validate the element itself.  The namespace token was
            # already validated above, so re-checking it here (as the
            # code used to) could never fail and let any token through
            # as the element name.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while True:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # Only one #id is consumed per simple selector; stop at
                # a second one and leave it in the stream.
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            token = stream.next()
            if not token == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % token)
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # Anything else is parsed as a nested simple selector.
                    selector = parse_simple_selector(stream)
                token = stream.next()
                if not token == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (token, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
        else:
            # End of this simple selector; swallow one separating space.
            if peek == ' ':
                stream.next()
            break
    return result
746
def is_int(v):
    """Return True if ``int(v)`` succeeds, False if it raises ValueError."""
    try:
        int(v)
    except ValueError:
        return False
    else:
        return True
754
def parse_attrib(selector, stream):
    """Parse the inside of a ``[...]`` attribute test and return an ``Attrib``.

    Reads ``attrib`` or ``namespace|attrib`` from ``stream``.  If the next
    token is ``]`` the result is an 'exists' test with value ``None``;
    otherwise an operator and a value are read.  The closing ``]`` is only
    peeked at, never consumed here.

    Raises ``SelectorSyntaxError`` when the operator is not one of the
    supported ones or the value is neither a ``Symbol`` nor a ``String``.
    """
    attrib = stream.next()
    if stream.peek() == '|':
        namespace = attrib
        stream.next()
        attrib = stream.next()
    else:
        namespace = '*'
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got %r" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got %r" % value)
    return Attrib(selector, namespace, attrib, op, value)
774
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)

    ``s`` may be an ``Element`` (its formatted name is used), an int,
    a falsy value or ``'*'``, one of the keywords ``'odd'`` / ``'even'``
    / ``'n'``, or a string in ``an+b`` form.
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Empty or wildcard argument: both coefficients are zero.
        return (0, 0)
    if isinstance(s, int):
        # A plain number is the constant term only.
        return (0, s)
    if s == 'odd':
        return (2, 1)
    elif s == 'even':
        return (2, 0)
    elif s == 'n':
        return (1, 0)
    if 'n' not in s:
        # No 'n' at all: the whole string is the constant term.
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a in ('-', '+'):
        # A bare sign means a coefficient of -1 / +1.
        a = int(a + '1')
    else:
        a = int(a)
    if not b:
        b = 0
    elif b in ('-', '+'):
        b = int(b + '1')
    else:
        b = int(b)
    return (a, b)
810
811
812
813
814
815
816 _whitespace_re = re.compile(r'\s+')
817
818 _comment_re = re.compile(r'/\*.*?\*/', re.S)
819
820 _count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
821
823 pos = 0
824 s = _comment_re.sub('', s)
825 while 1:
826 match = _whitespace_re.match(s, pos=pos)
827 if match:
828 preceding_whitespace_pos = pos
829 pos = match.end()
830 else:
831 preceding_whitespace_pos = 0
832 if pos >= len(s):
833 return
834 match = _count_re.match(s, pos=pos)
835 if match and match.group() != 'n':
836 sym = s[pos:match.end()]
837 yield Symbol(sym, pos)
838 pos = match.end()
839 continue
840 c = s[pos]
841 c2 = s[pos:pos+2]
842 if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
843 yield Token(c2, pos)
844 pos += 2
845 continue
846 if c in '>+~,.*=[]()|:#':
847 if c in '.#' and preceding_whitespace_pos > 0:
848 yield Token(' ', preceding_whitespace_pos)
849 yield Token(c, pos)
850 pos += 1
851 continue
852 if c == '"' or c == "'":
853
854 old_pos = pos
855 sym, pos = tokenize_escaped_string(s, pos)
856 yield String(sym, old_pos)
857 continue
858 old_pos = pos
859 sym, pos = tokenize_symbol(s, pos)
860 yield Symbol(sym, old_pos)
861 continue
862
def tokenize_escaped_string(s, pos):
    """Read a quoted string from ``s`` whose opening quote is at ``pos``.

    Returns ``(contents, new_pos)``: the text between the quotes with
    backslash escapes decoded, and the index just past the closing quote.

    Raises ``SelectorSyntaxError`` if no usable closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    pos = pos + 1
    start = pos
    while True:
        end = s.find(quote, pos)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:end]
        try:
            result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
        except UnicodeDecodeError:
            # The candidate text does not decode (e.g. it ends in a lone
            # backslash because this quote was escaped); keep scanning
            # for a later closing quote.
            pos = end + 1
        else:
            return result, end + 1
882
# Any character that cannot be part of a symbol: everything except word
# characters, backslash and hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read one symbol from ``s`` starting at index ``pos``.

    The symbol extends up to, but not including, the first character
    matched by ``_illegal_symbol``.  Returns ``(symbol, new_pos)``.

    If no illegal character follows, the rest of ``s`` is returned
    verbatim (without escape decoding) together with ``len(s)``.

    Raises ``AssertionError`` if the character at ``pos`` cannot start a
    symbol, and ``SelectorSyntaxError`` if the symbol's backslash escapes
    cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # The symbol runs to the end of the input.
        return s[start:], len(s)
    if match.start() == pos:
        # Raised explicitly rather than via ``assert``: under ``-O`` an
        # assert is stripped, and an empty symbol would then be returned
        # without advancing ``pos``.
        raise AssertionError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # Local import: ``sys`` is not among the module-level imports.
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
907
909
910 - def __init__(self, tokens, source=None):
911 self.used = []
912 self.tokens = iter(tokens)
913 self.source = source
914 self.peeked = None
915 self._peeking = False
916 try:
917 self.next_token = self.tokens.next
918 except AttributeError:
919
920 self.next_token = self.tokens.__next__
921
923 if self._peeking:
924 self._peeking = False
925 self.used.append(self.peeked)
926 return self.peeked
927 else:
928 try:
929 next = self.next_token()
930 self.used.append(next)
931 return next
932 except StopIteration:
933 return None
934
937
939 if not self._peeking:
940 try:
941 self.peeked = self.next_token()
942 except StopIteration:
943 return None
944 self._peeking = True
945 return self.peeked
946