;;; -*- xMode:Common-Lisp; Package:USER; Base:10 -*- ;;; ;;; This code was written by: ;;; ;;; Lawrence E. Freil ;;; National Science Center Foundation ;;; Augusta, Georgia 30909 ;;; ;;; If you modify this code, please comment your modifications ;;; clearly and inform the author of any improvements so they ;;; can be incorporated in future releases. ;;; ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression ;;; parser. ;;; ;;; This regular expression parser operates by taking a ;;; regular expression and breaking it down into a list ;;; consisting of lisp expressions and flags. The list ;;; of lisp expressions is then taken in turned into a ;;; lambda expression that can be later applied to a ;;; string argument for parsing. ;;; o Spend EVAL-WHEN and moved DEFVARs up. ;;; o Also: *REGEX-GROUPINGS* and *REGEX-GROUPINGS* ;;; are now bind in REGEX for MP safety. ;;; o Spend DEFPACKAGE and IN-PACKAGE. ;;; o Added a cache for REGEX-COMPILE. ;;; o Code is compiled to machine code always. ;;; --GB 2002-09-01 (defpackage :nregex (:use :common-lisp) (:export #:regex #:regex-compile #:*regex-groupings* #:*regex-groups*)) (in-package :nregex) ;;; ;;; First we create a copy of macros to help debug the beast (eval-when (eval compile load) (defvar *regex-debug* nil)) ; Set to nil for no debugging code (defmacro info (message &rest args) (if *regex-debug* `(format *standard-output* ,message ,@args))) ;;; ;;; Declare the global variables for storing the paren index list. ;;; (defvar *regex-groups* (make-array 10)) (defvar *regex-groupings* 0) ;;; ;;; Declare a simple interface for testing. You probably wouldn't want ;;; to use this interface unless you were just calling this once. ;;; (defun regex (expression string) "Usage: (regex [ :anchored (t/nil) ]) This function take a regular expression (supplied as source) and compiles this into a lambda list that a string argument can then be applied to. It is also possible to compile this lambda list for better performance or to save it as a named function for later use" (cond ((gethash (list source anchored) *regex-hash*)) (t (when (> (hash-table-count *regex-hash*) *regex-hash-max-size*) (clrhash *regex-hash*)) (setf (gethash (list source anchored) *regex-hash*) (compile nil (regex-compile-1 source :anchored anchored)))))) ;;; ;;; Now for the main regex compiler routine. ;;; (defun regex-compile-1 (source &key (anchored nil)) "Usage: (regex-compile [ :anchored (t/nil) ]) This function take a regular expression (supplied as source) and compiles this into a lambda list that a string argument can then be applied to. It is also possible to compile this lambda list for better performance or to save it as a named function for later use" (info "Now entering regex-compile with \"~A\"~%" source) ;; ;; This routine works in two parts. ;; The first pass take the regular expression and produces a list of ;; operators and lisp expressions for the entire regular expression. ;; The second pass takes this list and produces the lambda expression. (let ((expression '()) ; holder for expressions (group 1) ; Current group index (group-stack nil) ; Stack of current group endings (result nil) ; holder for built expression. (fast-first nil)) ; holder for quick unanchored scan ;; ;; If the expression was an empty string then it alway ;; matches (so lets leave early) ;; (if (= (length source) 0) (return-from regex-compile-1 '(lambda (&rest args) (declare (ignore args)) t))) ;; ;; If the first character is a caret then set the anchored ;; flags and remove if from the expression string. ;; (cond ((eql (char source 0) #\^) (setf source (subseq source 1)) (setf anchored t))) ;; ;; If the first sequence is .* then also set the anchored flags. ;; (This is purely for optimization, it will work without this). ;; (if (>= (length source) 2) (if (string= source ".*" :start1 0 :end1 2) (setf anchored t))) ;; ;; Also, If this is not an anchored search and the first character is ;; a literal, then do a quick scan to see if it is even in the string. ;; If not then we can issue a quick nil, ;; otherwise we can start the search at the matching character to skip ;; the checks of the non-matching characters anyway. ;; ;; If I really wanted to speed up this section of code it would be ;; easy to recognize the case of a fairly long multi-character literal ;; and generate a Boyer-Moore search for the entire literal. ;; ;; I generate the code to do a loop because on CMU Lisp this is about ;; twice as fast a calling position. ;; (if (and (not anchored) (not (position (char source 0) *regex-special-chars*)) (not (and (> (length source) 1) (position (char source 1) *regex-special-chars*)))) (setf fast-first `((if (not (dotimes (i length nil) (if (eql (char string i) ,(char source 0)) (return (setf start i))))) (return-from final-return nil))))) ;; ;; Generate the very first expression to save the starting index ;; so that group 0 will be the entire string matched always ;; (add-exp '((setf (aref *regex-groups* 0) (list index nil)))) ;; ;; Loop over each character in the regular expression building the ;; expression list as we go. ;; (do ((eindex 0 (1+ eindex))) ((= eindex (length source))) (let ((current (char source eindex))) (info "Now processing character ~A index = ~A~%" current eindex) (case current ((#\.) ;; ;; Generate code for a single wild character ;; (add-exp '((if (>= index length) (return-from compare nil) (incf index))))) ((#\$) ;; ;; If this is the last character of the expression then ;; anchor the end of the expression, otherwise let it slide ;; as a standard character (even though it should be quoted). ;; (if (= eindex (1- (length source))) (add-exp '((if (not (= index length)) (return-from compare nil)))) (add-exp '((if (not (and (< index length) (eql (char string index) #\$))) (return-from compare nil) (incf index)))))) ((#\*) (add-exp '(ASTRISK))) ((#\+) (add-exp '(PLUS))) ((#\?) (add-exp '(QUESTION))) ((#\() ;; ;; Start a grouping. ;; (incf group) (push group group-stack) (add-exp `((setf (aref *regex-groups* ,(1- group)) (list index nil)))) (add-exp `(,group))) ((#\)) ;; ;; End a grouping ;; (let ((group (pop group-stack))) (add-exp `((setf (cadr (aref *regex-groups* ,(1- group))) index))) (add-exp `(,(- group))))) ((#\[) ;; ;; Start of a range operation. ;; Generate a bit-vector that has one bit per possible character ;; and then on each character or range, set the possible bits. ;; ;; If the first character is carat then invert the set. (let* ((invert (eql (char source (1+ eindex)) #\^)) (bitstring (make-array 256 :element-type 'bit :initial-element (if invert 1 0))) (set-char (if invert 0 1))) (if invert (incf eindex)) (do ((x (1+ eindex) (1+ x))) ((eql (char source x) #\]) (setf eindex x)) (info "Building range with character ~A~%" (char source x)) (cond ((and (eql (char source (1+ x)) #\-) (not (eql (char source (+ x 2)) #\]))) (if (>= (char-code (char source x)) (char-code (char source (+ 2 x)))) (error "Invalid range \"~A-~A\". Ranges must be in acending order" (char source x) (char source (+ 2 x)))) (do ((j (char-code (char source x)) (1+ j))) ((> j (char-code (char source (+ 2 x)))) (incf x 2)) (info "Setting bit for char ~A code ~A~%" (code-char j) j) (setf (sbit bitstring j) set-char))) (t (cond ((not (eql (char source x) #\])) (let ((char (char source x))) ;; ;; If the character is quoted then find out what ;; it should have been ;; (if (eql (char source x) #\\ ) (let ((length)) (multiple-value-setq (char length) (regex-quoted (subseq source x) invert)) (incf x length))) (info "Setting bit for char ~A code ~A~%" char (char-code char)) (if (not (vectorp char)) (setf (sbit bitstring (char-code (char source x))) set-char) (bit-ior bitstring char t)))))))) (add-exp `((let ((range ,bitstring)) (if (>= index length) (return-from compare nil)) (if (= 1 (sbit range (char-code (char string index)))) (incf index) (return-from compare nil))))))) ((#\\ ) ;; ;; Intreprete the next character as a special, range, octal, group or ;; just the character itself. ;; (let ((length) (value)) (multiple-value-setq (value length) (regex-quoted (subseq source (1+ eindex)) nil)) (cond ((listp value) (add-exp value)) ((characterp value) (add-exp `((if (not (and (< index length) (eql (char string index) ,value))) (return-from compare nil) (incf index))))) ((vectorp value) (add-exp `((let ((range ,value)) (if (>= index length) (return-from compare nil)) (if (= 1 (sbit range (char-code (char string index)))) (incf index) (return-from compare nil))))))) (incf eindex length))) (t ;; ;; We have a literal character. ;; Scan to see how many we have and if it is more than one ;; generate a string= verses as single eql. ;; (let* ((lit "") (term (dotimes (litindex (- (length source) eindex) nil) (let ((litchar (char source (+ eindex litindex)))) (if (position litchar *regex-special-chars*) (return litchar) (progn (info "Now adding ~A index ~A to lit~%" litchar litindex) (setf lit (concatenate 'string lit (string litchar))))))))) (if (= (length lit) 1) (add-exp `((if (not (and (< index length) (eql (char string index) ,current))) (return-from compare nil) (incf index)))) ;; ;; If we have a multi-character literal then we must ;; check to see if the next character (if there is one) ;; is an astrisk or a plus. If so then we must not use this ;; character in the big literal. (progn (if (or (eql term #\*) (eql term #\+)) (setf lit (subseq lit 0 (1- (length lit))))) (add-exp `((if (< length (+ index ,(length lit))) (return-from compare nil)) (if (not (string= string ,lit :start1 index :end1 (+ index ,(length lit)))) (return-from compare nil) (incf index ,(length lit))))))) (incf eindex (1- (length lit)))))))) ;; ;; Plug end of list to return t. If we made it this far then ;; We have matched! (add-exp '((setf (cadr (aref *regex-groups* 0)) index))) (add-exp '((return-from final-return t))) ;; ;;; (print expression) ;; ;; Now take the expression list and turn it into a lambda expression ;; replacing the special flags with lisp code. ;; For example: A BEGIN needs to be replace by an expression that ;; saves the current index, then evaluates everything till it gets to ;; the END then save the new index if it didn't fail. ;; On an ASTRISK I need to take the previous expression and wrap ;; it in a do that will evaluate the expression till an error ;; occurs and then another do that encompases the remainder of the ;; regular expression and iterates decrementing the index by one ;; of the matched expression sizes and then returns nil. After ;; the last expression insert a form that does a return t so that ;; if the entire nested sub-expression succeeds then the loop ;; is broken manually. ;; (setf result (copy-tree nil)) ;; ;; Reversing the current expression makes building up the ;; lambda list easier due to the nexting of expressions when ;; and astrisk has been encountered. (setf expression (reverse expression)) (do ((elt 0 (1+ elt))) ((>= elt (length expression))) (let ((piece (nth elt expression))) ;; ;; Now check for PLUS, if so then ditto the expression and then let the ;; ASTRISK below handle the rest. ;; (cond ((eql piece 'PLUS) (cond ((listp (nth (1+ elt) expression)) (setf result (append (list (nth (1+ elt) expression)) result))) ;; ;; duplicate the entire group ;; NOTE: This hasn't been implemented yet!! (t (format *standard-output* "GROUP repeat hasn't been implemented yet~%"))))) (cond ((listp piece) ;Just append the list (setf result (append (list piece) result))) ((eql piece 'QUESTION) ; Wrap it in a block that won't fail (cond ((listp (nth (1+ elt) expression)) (setf result (append `((progn (block compare ,(nth (1+ elt) expression)) t)) result)) (incf elt)) ;; ;; This is a QUESTION on an entire group which ;; hasn't been implemented yet!!! ;; (t (format *standard-output* "Optional groups not implemented yet~%")))) ((or (eql piece 'ASTRISK) ; Do the wild thing! (eql piece 'PLUS)) (cond ((listp (nth (1+ elt) expression)) ;; ;; This is a single character wild card so ;; do the simple form. ;; (setf result `((let ((oindex index)) (block compare (do () (nil) ,(nth (1+ elt) expression))) (do ((start index (1- start))) ((< start oindex) nil) (let ((index start)) (block compare ,@result)))))) (incf elt)) (t ;; ;; This is a subgroup repeated so I must build ;; the loop using several values. ;; )) ) (t t)))) ; Just ignore everything else. ;; ;; Now wrap the result in a lambda list that can then be ;; invoked or compiled, however the user wishes. ;; (if anchored (setf result `(lambda (string &key (start 0) (end (length string))) (setf *regex-groupings* ,group) (block final-return (block compare (let ((index start) (length end)) ,@result))))) (setf result `(lambda (string &key (start 0) (end (length string))) (declare (optimize (speed 3) (safety 0)) (type simple-string string) (type fixnum start end)) (setf *regex-groupings* ,group) (block final-return (let ((length end)) ,@fast-first (do ((marker start (1+ marker))) ((> marker end) nil) (declare (type fixnum marker)) (let ((index marker)) (declare (type fixnum index)) (if (block compare ,@result) (return t))))))))))) ;;; ;;; Define a function that will take a quoted character and return ;;; what the real character should be plus how much of the source ;;; string was used. If the result is a set of characters, return an ;;; array of bits indicating which characters should be set. If the ;;; expression is one of the sub-group matches return a ;;; list-expression that will provide the match. ;;; (defun regex-quoted (char-string &optional (invert nil)) "Usage: (regex-quoted &optional invert) Returns either the quoted character or a simple bit vector of bits set for the matching values" (let ((first (char char-string 0)) (result (char char-string 0)) (used-length 1)) (cond ((eql first #\n) (setf result #\NewLine)) ((eql first #\c) (setf result #\Return)) ((eql first #\t) (setf result #\Tab)) ((eql first #\d) (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) ((eql first #\D) (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) ((eql first #\w) (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) ((eql first #\W) (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) ((eql first #\b) (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) ((eql first #\B) (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) ((eql first #\s) (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) ((eql first #\S) (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) ((and (>= (char-code first) (char-code #\0)) (<= (char-code first) (char-code #\9))) (if (and (> (length char-string) 2) (and (>= (char-code (char char-string 1)) (char-code #\0)) (<= (char-code (char char-string 1)) (char-code #\9)) (>= (char-code (char char-string 2)) (char-code #\0)) (<= (char-code (char char-string 2)) (char-code #\9)))) ;; ;; It is a single character specified in octal ;; (progn (setf result (do ((x 0 (1+ x)) (return 0)) ((= x 2) return) (setf return (+ (* return 8) (- (char-code (char char-string x)) (char-code #\0)))))) (setf used-length 3)) ;; ;; We have a group number replacement. ;; (let ((group (- (char-code first) (char-code #\0)))) (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group)) (cadr (aref *regex-groups* ,group))))) (if (< length (+ index (length nstring))) (return-from compare nil)) (if (not (string= string nstring :start1 index :end1 (+ index (length nstring)))) (return-from compare nil) (incf index (length nstring))))))))) (t (setf result first))) (if (and (vectorp result) invert) (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t)) (values result used-length)))