;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: TINY-REGEX; -*- ;; --------------------------------------------------------------------------- ;; Title: Tiny Regex Implementation (for fun) ;; Created: 2012-11-19 ;; Author: Gilbert Baumann ;; License: MIT style (see below) ;; --------------------------------------------------------------------------- ;; (c) copyright 2012 by Gilbert Baumann ;; Permission is hereby granted, free of charge, to any person obtaining ;; a copy of this software and associated documentation files (the ;; "Software"), to deal in the Software without restriction, including ;; without limitation the rights to use, copy, modify, merge, publish, ;; distribute, sublicense, and/or sell copies of the Software, and to ;; permit persons to whom the Software is furnished to do so, subject to ;; the following conditions: ;; ;; The above copyright notice and this permission notice shall be ;; included in all copies or substantial portions of the Software. ;; ;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. (defpackage :TINY-REGEX) (in-package :TINY-REGEX) ;;;; -- Overview ------------------------------------------------------------------------------ ;; This is a tiny implementation of a regular expression matcher with ;; PCRE-Semantics. The focus was onto getting this implementation ;; tiny and correct and beatiful for some metric. ;; The focus not so much was on speed. Though as being a backtracking ;; implementation it should fare well amoung other [primitive] ;; backtracking implementations. ;; The supported syntax is: ;; regex ::= concatenation | regex "|" concatenation ;; concatenation ::= star | concatenation star ;; star ::= single { "*" | "+" | "?" }* ;; single ::= "[" ... "]" | "(" regex ")" | char ;; The central idea behind this implementation is, that a regular ;; expression could be seen as a function, which applied to an input ;; calls a continuation with the remaining input and the captured ;; variables. The continuation is invoked repeatly in the order of ;; best to worst solution. ;; Such a function takes the input string, the current reading pointer ;; and the continuation as arguments. For instance the regex "a" is ;; translated to: ;; (lambda (input ptr cont) ;; (when (and (< ptr (length input)) ;; (char= (char input ptr) #\a)) ;; (funcall cont (1+ ptr) nil) ;; For each of the regex operations (catenation, alternation, ;; iteration) it is now possible to define a functor, which does the ;; right thing. Within the PARSE function, we do not bulid an AST but ;; the construct the correct function. This is what keeps the LoC ;; count low. ;; When matching a regex, we operate brute force. We try the regex at ;; position 0, then at position 1 and so on. We implement anchoring, ;; but anchoring conditions are only tested while evaluating a regex. ;;;; -- Scanner Functors ---------------------------------------------------------------------- (defun scanner-if (predicate) (lambda (input ptr cont) (and (< ptr (length input)) (funcall predicate (char input ptr)) (funcall cont (1+ ptr) nil)))) (defun scan-bol (input ptr cont) (declare (ignore input)) (and (= ptr 0) (funcall cont ptr nil))) (defun scan-eol (input ptr cont) (and (= ptr (length input)) (funcall cont ptr nil))) (defun scan-epsilon (input ptr cont) (declare (ignore input)) (funcall cont ptr nil)) (defun scanner/and (&rest xs) (reduce #'(lambda (lhs rhs) (lambda (input ptr cont) (funcall lhs input ptr (lambda (ptr-1 val-1) (funcall rhs input ptr-1 (lambda (ptr-2 val-2) (funcall cont ptr-2 (compose-values val-1 val-2)))))))) xs :from-end t :initial-value #'scan-epsilon)) (defun scanner/or (&rest xs) (lambda (input ptr cont) (loop for x in xs do (funcall x input ptr cont)))) (defun scanner/setq (var sub) (lambda (input ptr cont) (funcall sub input ptr (lambda (ptr-1 val-1) (funcall cont ptr-1 (compose-values (list (list var ptr ptr-1)) val-1)))))) (defun scanner/iter (sub) (lambda (input ptr cont) (labels ((greedy (ptr-1 val-1 trail) (unless (member ptr-1 trail) (funcall sub input ptr-1 (lambda (ptr-2 val-2) (greedy ptr-2 (compose-values val-1 val-2) (cons ptr-1 trail))))) (funcall cont ptr-1 val-1))) (greedy ptr nil nil)))) (defun compose-values (v1 v2) (append (remove-if (lambda (x) (find (car x) v2 :key #'car)) v1) v2)) ;;;; -- Parser -------------------------------------------------------------------------------- (defun parse-regex (string) (with-input-from-string (input string) (let ((stack nil) (cur-and nil) (cur-or nil) (group-number 0)) (loop for c = (read-char input nil nil) while c do (case c (#\| (push (apply #'scanner/and (reverse cur-and)) cur-or) (setf cur-and nil)) (#\* (setf (car cur-and) (scanner/iter (car cur-and)))) (#\? (setf (car cur-and) (scanner/or (car cur-and) (scanner/and)))) (#\+ (setf (car cur-and) (scanner/and (car cur-and) (scanner/iter (car cur-and))))) (#\( (push cur-or stack) (push cur-and stack) (push (incf group-number) stack) (setf cur-and nil cur-or nil)) (#\) (unless stack (error "Extra ')'")) (push (apply #'scanner/and (reverse cur-and)) cur-or) (setf cur-and (cons (scanner/setq (pop stack) (apply #'scanner/or (reverse cur-or))) (pop stack))) (setf cur-or (pop stack))) (#\. (push (scanner-if #'identity) cur-and)) (#\^ (push #'scan-bol cur-and)) (#\$ (push #'scan-eol cur-and)) (#\[ (push (scanner-if (parse-set input)) cur-and)) (#\\ (push (let ((c (read-char input))) (scanner-if (lambda (x) (char= c x)))) cur-and)) (otherwise (push (let ((c c)) (scanner-if (lambda (x) (char= c x)))) cur-and)))) (when stack (error "Missing ')'.")) (push (apply #'scanner/and (reverse cur-and)) cur-or) (values (scanner/setq 0 (apply #'scanner/or (reverse cur-or))) group-number)))) (defun parse-set (input) (let ((negatedp #'identity) (xs #'false)) (loop for i from 0 for c = (read-char input) do (cond ((and (eql c #\]) (not (zerop i))) (return)) ((and (eql c #\^) (zerop i)) (setf negatedp #'complement)) ((eql #\- (peek-char nil input nil nil)) (read-char input) (setf xs (let ((c c) (xs xs) (d (read-char input))) (lambda (x) (or (funcall xs x) (char<= c x d)))))) (t (setf xs (let ((c c) (xs xs)) (lambda (x) (or (funcall xs x) (char= x c)))))))) (funcall negatedp xs))) ;;;; -- Driver -------------------------------------------------------------------------------- (defun scan (expr input) (multiple-value-bind (fun ngroup) (parse-regex expr) (loop for i from 0 to (length input) do (funcall fun input i (lambda (ptr value) (declare (ignore ptr)) (return-from scan (loop for v from 0 to ngroup collect (or (cdr (assoc v value)) '(nil nil))))))) :nomatch))