;; -*- Mode: Lisp; -*- ;; --------------------------------------------------------------------------- ;; Title: CQuery --- Querying CSS selectors for Closure HTML and Closure XML ;; Created: 2014-08-16 ;; Author: Gilbert Baumann ;; License: MIT style (see below) ;; --------------------------------------------------------------------------- ;; (c) copyright 2014 by Gilbert Baumann ;; Permission is hereby granted, free of charge, to any person obtaining ;; a copy of this software and associated documentation files (the ;; "Software"), to deal in the Software without restriction, including ;; without limitation the rights to use, copy, modify, merge, publish, ;; distribute, sublicense, and/or sell copies of the Software, and to ;; permit persons to whom the Software is furnished to do so, subject to ;; the following conditions: ;; ;; The above copyright notice and this permission notice shall be ;; included in all copies or substantial portions of the Software. ;; ;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. (defpackage :de.bauhh.lquery (:use :common-lisp :parse) (:export :query) (:import-from :clex2 #:$$)) (in-package :de.bauhh.lquery) ;;;; ------------------------------------------------------------------------------------------ ;; Where is the compiler? ;; Shouldn't we go with a separate tokenizer? ;; Where is the style sheet parser? ;;;; -- Parser -------------------------------------------------------------------------------- ;; Here is an overview, what the SEXP syntax of CSS selectors is. ;; CSS Lisp ;; --------------------------------------------------------------- ;; A (:gi :a) ;; .foo (:class "foo") ;; #foo (:id "foo") ;; * T ;; ;; A B (>> A B) ;the cause of all evil ;; A + B (+ A B) ;; A > B (> A B) ;; A ~ B (~ A B) ;; ;; [attrib] (:attribute-exists attrib) ;; [a = b] (= a b) ;; [a ~= b] (~= a b) ;; [a |= b] (-= a b) ;; [a ^= b] (^= a b) ;; [a $= b] ($= a b) ;; :not(x) (not x) ;; :has(x) (has x) ;; :matches(x) (matches x) ;; :nth-xxx (:nth-xxx a b) ;as in an + b ;; :xyz (:pseudo-class :xyz) ;; ::xyz (:pseudo-element :xyz) ;; xyz (and (and x y) z) ;; x,y (or x y) ;; Pseudo classes implemented: ;; :empty ;; :first-child, :last-child, :only-child, :nth-child, :nth-last-child ;; :first-of-type, :last-of-type, :only-of-type, :nth-of-type, :nth-last-of-type ;; :not(), :has() (defun parse-selector (input) (parse (input) (start -> complex-selector-list => $1) (complex-selector-list -> complex-selector (* "," S* complex-selector => $3) => (if $2 (list* 'or $1 $2) $1)) (scope-relative-selector-list -> scope-relative-selector (* "," s* scope-relative-selector => $3) => (if $2 (list* 'or $1 $2) $1)) (scope-relative-selector -> complex-selector => `(>> (:pseudo-class :scope) ,$1) -> combinator complex-selector => (list $1 (list :pseudo-class :scope) $2)) (complex-selector -> complex-selector-1 S* => $1) (complex-selector-1 -> compound-selector => $1 -> complex-selector-1 combinator compound-selector => (list $2 $1 $3)) (combinator -> :s s* combinator-1 s* => $3 -> :s s* => '>> -> combinator-1 s* => $1) (combinator-1 -> "+" => $1 -> ">" => $1 -> "~" => $1 -> "||" => $1 -> "/" :ident "/" => (list :combinator $2)) #+(or) ;unused (compound-selector-list -> compound-selector-list-1 S* => $1) #+(or) ;unused (compound-selector-list-1 -> compound-selector => $1 -> compound-selector-list-1 "," S* compound-selector => (list 'or $1 $4)) (compound-selector -> type-selector (* (or id class attrib pseudo)) => (if $2 (list* 'and $1 $2) $1) -> (+ (or id class attrib pseudo)) => (if (cdr $1) (cons 'and $1) (car $1))) #+(or) ;unused (simple-selector-list -> simple-selector S* (* "," S* simple-selector) S* => (if $2 (list* 'or $1 $2) $1)) #+(or) ;unused (simple-selector -> type-selector -> id -> class -> attrib -> pseudo) (type-selector -> #| wqname-prefix? |# element-name) (element-name -> :ident => (list :gi $1) -> "*" => t) (id -> "#" :ident => (list :id $2)) (class -> "." :ident => (list :class $2)) (attrib -> "[" S* attrib-name "]" => (list :attribute-exists $3) -> "[" S* attrib-name attrib-match (or :ident :string) S* (? attrib-flags) "]" => `(,$4 ,$3 ,$5 ,@(if $7 (list $7)))) (attrib-name -> #| wqname-prefix? |# :ident S* => (intern (string-upcase $1) :keyword)) (attrib-match -> "=" => '= -> (or "^=" "$=" "*=" "~=" "|=") s* => $1) (attrib-flags -> :ident S*) (pseudo ;; '::' starts a pseudo-element, ':' a pseudo-class ;; Exceptions: :first-line, :first-letter, :before and :after. ;; Note that pseudo-elements are restricted to one per selector and ;; occur only in the last compound-selector. -> ":" (? ":") :ident => (list :pseudo-class (intern (string-upcase $3) :keyword)) -> ":" (? ":") functional-pseudo => $3 -> :not complex-selector-list ")" => (list 'not $2) -> :has scope-relative-selector-list ")" => (list 'has $2) -> :matches complex-selector-list ")" => (list 'matches $2)) (functional-pseudo -> :function S* value ")" => (list* :pseudo-class (intern (string-upcase $1) :keyword) $3)) (value -> (* (or :s :ident :number :dimension :percentage "+"))) ;?? (s* -> -> :s s*) ;; ----- Tokens ------ (:case-sensitive-p nil) (:macro ident (and (? #\-) nmstart (* nmchar))) (:macro nl (or #\newline (and #\return #\newline) #\return #\page)) (:macro w (* (or #\space #\tab #\return #\newline #\page))) (:macro unicode (and '"\\" "[0-9a-f]{1,6}" (? (or (and #\return #\newline) #\space #\newline #\return #\tab #\page)))) (:macro escape (or unicode (and '"\\" (- t (or "[0-9a-f]" #\newline #\return #\page))))) (:macro nonascii (- t (range #.(code-char 0) #.(code-char 127)))) (:macro nmstart (or "[_a-z]" nonascii escape)) (:macro nmchar (or "[_a-z0-9-]" nonascii escape)) (:macro plain-ident (and (? #\-) (or "[_a-z]" nonascii) (* (or "[_a-z0-9-]" nonascii)))) (:macro name (+ nmchar)) (:macro num (or "[0-9]+" "[0-9]*[.][0-9]+")) (:macro string1 (* (or (- T (or #\Newline #\Return #\Page #\")) (and #\\ nl) nonascii escape))) (:macro string2 (* (or (- T (or #\Newline #\Return #\Page #\')) (and #\\ nl) nonascii escape))) (:macro string (or string1 string2)) (:s -> (+ (or #\space #\tab #\return #\newline #\page))) ;comments? (:~= -> w '"~=" => '~=) ;'w' needed here, bug in the CSS spec (:\|= -> '"|=" => '\|=) (:^= -> '"^=" => '^=) (:$= -> '"$=" => '$=) (:*= -> '"*=" => '*=) (:ident -> plain-ident => $$) ;an identifier without escapes, for speed (:ident -> ident => (css-deescape $$)) ;; (:string -> #\" (= s (* (- T (or #\Newline #\Return #\Page #\" #\\)))) #\" => s) (:string -> #\' (= s (* (- T (or #\Newline #\Return #\Page #\' #\\)))) #\' => s) (:string -> #\" (= s string1) #\" => (css-deescape s)) (:string -> #\' (= s string2) #\' => (css-deescape s)) (:invalid -> #\" (= s string1)) (:invalid -> #\' (= s string2)) ;; (:function -> (= x ident) '"(" => x) (:number -> num => (read-from-string $$)) (:|#| -> '"#") (:|+| -> w '"+" => '+) (:|>| -> w '">" => '>) (:|<| -> w '"<" => '<) ;extension (:|,| -> w '",") (:|~| -> w '"~" => '~) (:not -> '":not(") (:has -> '":has(") ;CSS4 (:matches -> '":matches(") ;CSS4 ;; Fix for more of the white space issues (:|)| -> w '")") ;; CSS is not very orthogonal here. (:in :nth (:s -> (+ (or #\space #\tab #\return #\newline #\page))) (:integer -> (+ "[0-9]") => (parse-integer $$)) (:simple-tokens "+" "-" "n" "odd" "even") (:|)| -> '")" => (clex-deriv:begin :initial))) (:nth-child -> '":nth-child(" => (clex-deriv:begin :nth)) (:nth-last-child -> '":nth-last-child(" => (clex-deriv:begin :nth)) (:nth-of-type -> '":nth-of-type(" => (clex-deriv:begin :nth)) (:nth-last-of-type -> '":nth-last-of-type(" => (clex-deriv:begin :nth)) (:atkeyword -> '"@" (= x ident) => x) ;; (:invalid -> invalid) (:percentage -> (= n num) '"%" => (list :% (read-from-string n))) (:dimension -> (= n num) (= x ident) => (list (intern (string-upcase x) :keyword) (read-from-string n))) (:cdo -> '"") (:in (:initial :nth) ((and '"/*" (* (- t #\*)) (+ #\*) (* (and (- t (or #\/ #\*)) (* (- t #\*)) (+ #\*))) #\/))))) (defun css-deescape (string) "Process escape sequences in string according to CSS3." (with-output-to-string (bag) (clex2:lexing (string) (:case-sensitive-p nil) ((and '"\\" (= code-point "[0-9a-f]{1,6}") (? (or (and #\return #\newline) #\space #\newline #\return #\tab #\page))) (princ (code-char (parse-integer code-point :radix 16)) bag)) ((and '"\\" (= c (- t (or "[0-9a-f]" #\newline #\return #\page)))) ;; what escapes are defined? is \n = 'n' or newline? (princ c bag)) ((and '"\\" (or #\newline (and #\return #\newline) #\return #\page))) (t (princ $$ bag))))) (defun parse-css-selector (string) (parse-selector string)) ;;;; -- Matcher ------------------------------------------------------------------------------- (defparameter +white-space+ (list #\space #\tab #\newline #\return)) (defun css-white-space-p (char) (or (eql char #\space) (eql char #\newline) (eql char #.(code-char #x9)) ;tab (eql char #.(code-char #xA)) ;newline (eql char #.(code-char #xD)) ;carriage return (eql char #.(code-char #xC)))) ;formfeed (defun space-separated-list-member (item string &key (case-sensitive-p nil)) "When 'string' is taken as a space separated list of sub-strings, is 'item' one of those sub-strings?" ;; There is a catch: (space-separated-list-member "foo bar" "foo bar") must not be NIL. (let ((i 0) (p 0) (end (length string))) (loop (loop while (and (< i end) (css-white-space-p (char string i))) do (incf i)) (when (= i end) (return)) (setf p i) (loop while (and (< i end) (not (css-white-space-p (char string i)))) do (incf i)) (when (if case-sensitive-p (string= item string :start2 p :end2 i) (string-equal item string :start2 p :end2 i)) (return t))))) (defun dash-match-p (putative-prefix string &key (case-sensitive-p nil)) "Is 'string' either exactly 'putative-prefix' or is 'putative-prefix' a prefix of 'string' immediately followed by a dash?" (let ((p (mismatch putative-prefix string :key (if case-sensitive-p #'char= #'char-equal)))) (or (null p) (and (= p (length putative-prefix)) (< p (length string)) (char= #\- (char string p)))))) (defun prefixp (putative-prefix sequence &key (test #'eql)) (let ((end (length putative-prefix))) (= (or (mismatch putative-prefix sequence :test test) end) end))) (defun suffixp (putative-suffix sequence &key (test #'eql)) (= 0 (or (mismatch putative-suffix sequence :from-end t :test test) 0))) ;;;; -- Query Compiler ------------------------------------------------------------------------ (defun selector-and-flags (sel) (if (stringp sel) (setf sel (parse-css-selector sel))) (let ((flags nil) ;list of (name bind step) (i 0)) (labels ((genflag () (intern (format nil "FLAG-~D" (incf i)))) (morph (x) (cond ((atom x) x) ((case (car x) (>> (let ((flag (genflag))) (push (list flag `(or ,flag ,(morph (second x))) flag) flags) `(and ,flag ,(morph (third x))))) (> (let ((flag (genflag))) (push (list flag (morph (second x)) flag) flags) `(and ,flag ,(morph (third x))))) (+ (let ((flag (genflag))) (push (list flag nil (morph (second x))) flags) `(and ,flag ,(morph (third x))))) (~ (let ((flag (genflag))) (push (list flag nil `(or ,flag ,(morph (second x)))) flags) `(and ,flag ,(morph (third x))))) ((has) ;; don't x) (t (cons (car x) (mapcar #'morph (cdr x))))))))) (values (morph sel) flags)))) (defun compile-selector (selector) (multiple-value-bind (selector flags) (selector-and-flags selector) ;; No gensym here, you have no business in my package. `(LAMBDA (CONT NODE &KEY SCOPE) (LABELS ((WALK (NODE INDEX REMAINING-SIBLINGS CHILDREN SCOPE PARENT ,@(mapcar #'first flags)) (DECLARE (IGNORABLE NODE INDEX REMAINING-SIBLINGS CHILDREN SCOPE PARENT ,@(mapcar #'first flags))) (UNLESS (EQ :PCDATA (GI NODE)) ;these never match (WHEN ,(compile-predicate selector 'node 'index 'remaining-siblings 'children) (FUNCALL CONT NODE)) (LET ,(mapcar (lambda (flag-spec) (list (first flag-spec) (compile-predicate (second flag-spec) 'node 'index 'remaining-siblings 'children))) flags) (LET ((NCHILDREN (CHILDREN NODE))) (LOOP FOR NINDEX FROM 0 FOR NREMAINING-SIBLINGS ON NCHILDREN DO (PROGN (WALK (CAR NREMAINING-SIBLINGS) NINDEX NREMAINING-SIBLINGS NCHILDREN SCOPE NODE ,@(mapcar #'first flags)) (SETQ ,@(mapcan (lambda (flag-spec) (list (first flag-spec) ;; ??? (compile-predicate (third flag-spec) '(car nremaining-siblings) 'index ;??? 'remaining-siblings 'children))) flags))))))))) ;; Hmm this node has no parent, and now what? ;; what about "HTML:has(>HEAD)"? And do nested :has()s ascend the tree? ;; They do not since that parent cannot be matched. ;; ;; In fact :scope in :has can never match, since it is ;; prepended by default. But the spec [it's CSS after all] is ;; not clear about this. ;; (LET ((ROOT (LIST* :ROOT NIL (LIST NODE)))) ;### a fake root node. (WALK NODE 0 (LIST NODE) (LIST NODE) (OR SCOPE ROOT) ROOT ,@(mapcar (constantly nil) flags))))))) ;; CSS lulls you in soft words, but is very explicit: ;; | The relational pseudo-class, :has(), is a functional pseudo-class ;; | taking a relative selector list as an argument. It represents an ;; | element if any of the relative selectors, when absolutized and ;; | evaluated with the element as the :scope elements, would match at ;; | least one element. ;; Fine. Lets read about relative selectors: ;; | In a relative selector, ":scope" (the :scope pseudo-class ;; | followed by a space) is implied at the beginning of each complex ;; | selector that does not already contain the :scope pseudo-class. ;; The interesting thing here is: "that does not already contain the ;; :scope pseudo-class". ;; Huh? So is :has(:scope A) the same as :has(A)? What does "contain" ;; mean here? Contain where? What about ;; :has(:foo:bar:matches(:scope))? ;; Does this contain :scope? Or ;; :has(A B C :scope) ;; Does it? (defun compile-predicate (selector node-var index-var siblings-var children-var) (let ((tmp (gensym))) (cond ((symbolp selector) selector) ((atom selector) (error "Huh?")) ((ecase (car selector) ((AND) `(AND ,@(mapcar (lambda (x) (compile-predicate x node-var index-var siblings-var children-var)) (cdr selector)))) ((OR) `(OR ,@(mapcar (lambda (x) (compile-predicate x node-var index-var siblings-var children-var)) (cdr selector)))) ((not) `(NOT ,(compile-predicate (cadr selector) node-var index-var siblings-var children-var))) ((:gi) `(STRING-EQUAL (gi ,node-var) ,(cadr selector))) ((:id) `(EQUAL (attr ,node-var :id) ,(cadr selector))) ((:class) `(SPACE-SEPARATED-LIST-MEMBER ,(cadr selector) (attr ,node-var :class))) ((:attribute-exists) `(attr ,node-var ,(cadr selector))) ((*=) `(LET ((,tmp (attr ,node-var ,(cadr selector)))) (and ,tmp (search ,(caddr selector) ,tmp)))) ((~=) `(SPACE-SEPARATED-LIST-MEMBER ,(caddr selector) (attr ,node-var ,(cadr selector)))) ((^=) `(LET ((,tmp (attr ,node-var ,(cadr selector)))) (and ,tmp (prefixp ,(caddr selector) ,tmp)))) (($=) `(LET ((,tmp (attr ,node-var ,(cadr selector)))) (and ,tmp (suffixp ,(caddr selector) ,tmp)))) ((-=) `(LET ((,tmp (attr ,node-var ,(cadr selector)))) (and ,tmp (dash-match-p ,(caddr selector) ,tmp)))) ((=) `(LET ((,tmp (attr ,node-var ,(cadr selector)))) (and ,tmp (string= ,(caddr selector) ,tmp)))) ((<) `(and ,(compile-predicate (caddr selector) node-var index-var siblings-var children-var) (some (lambda (,tmp) ,(compile-predicate (cadr selector) tmp index-var siblings-var children-var)) (children ,node-var)))) ((has) ;; what about :has(> A) and friends? `(QUERY ',(cadr selector) PARENT :SCOPE ,node-var)) ((:pseudo-class) (ecase (cadr selector) (:first-child `(= ,index-var 0)) (:last-child `(NULL (CDR ,siblings-var))) (:empty `(NULL (CHILDREN ,node-var))) (:only-child `(AND (= ,index-var 0) (NULL (CDR ,siblings-var)))) (:first-of-type `(= ,index-var (POSITION (GI ,node-var) ,children-var :KEY #'GI))) (:last-of-type `(NOT (FIND (GI ,node-var) (CDR ,siblings-var) :KEY #'GI))) (:only-of-type `(= 1 (COUNT (GI ,node-var) ,children-var :KEY #'GI))) (:link (compile-predicate `(:attribute-exists :href) node-var index-var siblings-var children-var)) (:scope `(EQ ,node-var SCOPE)))) ((:nth-child) (destructuring-bind (a b) (cdr selector) `(AN+B-P ,index-var ,a ,b))) ((:nth-last-child) (destructuring-bind (a b) (cdr selector) `(AN+B-P (LENGTH (CDR ,siblings-var)) ,a ,b))) ((:nth-of-type) ;; this is not particular efficient (destructuring-bind (a b) (cdr selector) `(AN+B-P (POSITION ,node-var (REMOVE (GI ,node-var) ,children-var :KEY #'GI :TEST-NOT #'EQL)) ,a ,b))) ((:nth-last-of-type) ;; this is not particular efficient (destructuring-bind (a b) (cdr selector) `(AN+B-P (POSITION ,node-var (REVERSE (REMOVE (GI ,node-var) ,siblings-var :KEY #'GI :TEST-NOT #'EQL))) ,a ,b))) ((:pseudo-element) nil)))))) (defun an+b-p (i a b) (if (zerop a) (= i (- b 1)) (multiple-value-bind (q r) (floor (+ i (- b) 1) a) (and (zerop r) (>= q 0))))) (defun query (selector node &key scope) (let ((fun (compile nil (compile-selector selector)))) (let ((res nil)) (funcall fun (lambda (x) (push x res)) node :scope scope) (reverse res)))) (define-compiler-macro query (&whole whole selector node &key scope) (cond ((constantp selector) `(LET ((RES NIL)) (,(compile-selector (eval selector)) #'(LAMBDA (X) (PUSH X RES)) ,node :scope ,scope) (REVERSE RES))) (t whole))) ;;;; -- LML ----------------------------------------------------------------------------------- (defmethod children ((object string)) nil) (defmethod attr ((object string) attr &optional default) (declare (ignore attr)) default) (defmethod gi ((object string)) :pcdata) (defmethod children ((object cons)) (de.bauhh.lml:children object)) (defmethod attr ((object cons) attr &optional default) (de.bauhh.lml:attr object attr default)) (defmethod gi ((object cons)) (de.bauhh.lml:gi object))