(in-package :landmark-parser) ;; (require 'asdf-install) ;; (asdf-install:install :drakma) ; (require 'drakma) (require 'cl-ppcre) (defvar *landmark-command-dispatch-table* nil) (setf *landmark-command-dispatch-table* (list)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Utilities that should probably be moved out... (defun starts-with (pat str) (cond ((= 0 (length str)) nil) ((> (length pat) (length str)) nil) ((string= pat (subseq str 0 (length pat))) t) (t nil))) (assert (not (starts-with "foo" ""))) (assert (starts-with "foo" "foo")) (assert (starts-with "foo" "foobar")) (assert (not (starts-with "foo" "ofobar"))) (defun ends-with (pat str) (cond ((= 0 (length str)) nil) ((> (length pat) (length str)) nil) ((string= pat (subseq str (- (length str ) (length pat)))) t) (t nil))) (assert (not (ends-with "foo" ""))) (assert (ends-with "foo" "foo")) (assert (ends-with "foo" "barfoo")) (assert (not (ends-with "foo" "ofobar"))) (assert (not (ends-with "foo" "barofo"))) (defun link-trimmer (link) (cond ((starts-with "\"" link) (link-trimmer (subseq link 1))) ((starts-with "'" link) (link-trimmer (subseq link 1))) ((ends-with "\"" link) (link-trimmer (subseq link 0 (- (length link) 1)))) ((ends-with "'" link) (link-trimmer (subseq link 0 (- (length link) 1)))) (t link))) (assert (string= "foo" (link-trimmer "\"foo"))) (assert (string= "foo" (link-trimmer "foo\""))) (assert (string= "foo" (link-trimmer "\"foo\""))) (defun get-all-links (html) (let ((parser (landmark-parser:new-parser html))) (mapcar #'link-trimmer (landmark-parser:extract-all parser '((forward-to "" "\"" "'"))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun remove-command (name) (setf *landmark-command-dispatch-table* (remove-if #'(lambda (elt) (cond ((equal name (car elt)) ; (format t "remove-command: dupe") t) (t nil))) *landmark-command-dispatch-table*))) (defun add-alias (name fn) (remove-command name) (push (cons name fn) *landmark-command-dispatch-table*)) (defmacro defcommand (name lambda-list &rest body) `(progn (defun ,name ,(cons 'parser lambda-list) ,@body) (remove-command ',name) (push (cons ',name #',name) *landmark-command-dispatch-table*))) (defstruct (parser) document (lc-document nil) (current-pos 0)) (defun new-parser (document) (make-parser :document document :lc-document (string-downcase document) :current-pos 0)) (defcommand forward (cnt) (let ((pos (+ (parser-current-pos parser) cnt))) (cond ((< pos 0) nil) ((< pos (length (parser-lc-document parser))) (setf (parser-current-pos parser) pos) t) (t nil)))) (defcommand backward (cnt) (let ((pos (- (parser-current-pos parser) cnt))) (cond ((< pos 0) nil) ((< pos (length (parser-lc-document parser))) (setf (parser-current-pos parser) pos) t) (t nil)))) (defcommand forward-to (landmark) ; (format t "forward-to: lmark:~a dc:~a~&" landmark (string-downcase landmark)) (let ((pos (search (string-downcase landmark) (parser-lc-document parser) :start2 (parser-current-pos parser)))) (cond (pos (setf (parser-current-pos parser) pos) t) (t nil)))) (defcommand forward-past (landmark) (let ((pos (search (string-downcase landmark) (parser-lc-document parser) :start2 (parser-current-pos parser)))) (cond (pos (setf (parser-current-pos parser) (+ pos (length landmark))) t) (t nil)))) (defcommand rewind-to (landmark) (let ((pos (search (string-downcase landmark) (parser-lc-document parser) :from-end t :end2 (parser-current-pos parser)))) (cond (pos (setf (parser-current-pos parser) (+ pos (length landmark))) t) (t nil)))) (defcommand rewind-past (landmark) (let ((pos (search (string-downcase landmark) (parser-lc-document parser) :from-end t :end2 (parser-current-pos parser)))) (cond (pos (setf (parser-current-pos parser) pos) t) (t nil)))) (defcommand set-position (pos) (cond ((< pos 0) nil) ((< pos (length (parser-lc-document parser))) (setf (parser-current-pos parser) pos) t) (t nil))) (defun find-command-function (symbol) ; (format t "lookingfor ~a in ~a~&" ; symbol ; (mapcar #'car *landmark-command-dispatch-table*)) (loop for (name . fn) in *landmark-command-dispatch-table* with ret = nil finally (return ret) do (when (equal (symbol-name symbol) (symbol-name name)) (setf ret fn)))) (defcommand apply-commands (commands) (cond ((null commands) t) (t (let* ((form (first commands)) (command (first form)) (args (rest form)) (fn (find-command-function command))) ; (format t "form:~a command:~a args:~a fn:~a~&" ; form command args fn) (if (apply fn (cons parser args)) (apply-commands parser (rest commands)) nil))))) (defcommand extract (from &optional to) (subseq (parser-document parser) from to)) (defcommand extract-between (start-cmds end-cmds) (let ((orig (parser-current-pos parser))) (if (apply-commands parser start-cmds) (let ((beg (parser-current-pos parser))) (if (apply-commands parser end-cmds) (extract parser beg (parser-current-pos parser)) (progn (setf (parser-current-pos parser) orig) nil))) (progn (setf (parser-current-pos parser) orig) nil)))) (defcommand extract-from-to (from to) (let ((orig (parser-current-pos parser))) (if (forward-past parser from) (let ((beg (parser-current-pos parser))) (if (forward-to parser to) (extract parser beg (parser-current-pos parser)) (progn (setf (parser-current-pos parser) orig) nil))) (progn (setf (parser-current-pos parser) orig) nil)))) (defcommand extract-all (start-cmds end-cmds &optional (results (list))) (if (apply-commands parser start-cmds) (let ((beg (parser-current-pos parser))) (if (apply-commands parser end-cmds) (extract-all parser start-cmds end-cmds (cons (extract parser beg (parser-current-pos parser)) results)) (reverse results))) (reverse results))) (defcommand forward-to-first-of (landmarks) (let ((mpos nil) (lmark nil)) (loop for landmark in landmarks do (let ((pos (search (string-downcase landmark) (parser-lc-document parser) :start2 (parser-current-pos parser)))) ; (format t "mpos=~a pos=~a lmark:~a~&" mpos pos landmark) (cond ((null mpos) (setf lmark landmark) (setf mpos pos)) ((and pos (< pos mpos)) (setf lmark landmark) (setf mpos pos)) (t (setf mpos mpos))))) (cond (mpos (setf (parser-current-pos parser) mpos) (values t lmark)) (t (values nil lmark))))) (defcommand forward-past-first-of (landmarks) (multiple-value-bind (foundp landmark) (forward-to-first-of parser landmarks) (cond (foundp (setf (parser-current-pos parser) (+ (parser-current-pos parser) (length landmark))) (values t landmark)) (t (values nil landmark))))) (defcommand reset-parser () (setf (parser-current-pos parser) 0)) (defcommand forward-to-regex (regex) (multiple-value-bind (match-start) (cl-ppcre:scan regex (parser-lc-document parser) :start (parser-current-pos parser)) (cond (match-start (setf (parser-current-pos parser) match-start) t) (t nil)))) (defcommand forward-past-regex (regex) (multiple-value-bind (match-start match-end) (cl-ppcre:scan regex (parser-lc-document parser) :start (parser-current-pos parser)) (cond (match-start (setf (parser-current-pos parser) match-end) t) (t nil)))) (defmacro aprog1 (init &body body) "Anaphoric prog1 - binds the symbol 'it to the result of evaluating the first form, subsequent forms are evaluated for effect, just as with prog1. My favorite use for this is in-place construction of hash-tables: (aprog1 (make-hash-table :test #'equal) (setf (gethash it :a) 1) (setf (gethash it :b) 2) (setf (gethash it :c) 3)) " `(let ((it ,init)) ,@body it)) (assert (equal 0 (let ((p (new-parser ""))) (forward-past p " ") (parser-current-pos p)))) (assert (equal 4 (let ((p (new-parser "foo bar qux baz bla"))) (forward-past p " ") (parser-current-pos p)))) (assert (equal 4 (let ((p (new-parser "foo bar qux baz bla"))) (forward-to p "BAR") (parser-current-pos p)))) (assert (equal 0 (let ((p (new-parser "foo bar qux baz bla"))) (forward-past p "zzz") (parser-current-pos p)))) (assert (equal 3 (let ((p (new-parser "foo bar qux baz bla"))) (forward-to p " ") (parser-current-pos p)))) (assert (equal 0 (let ((p (new-parser "abcdef"))) (forward-to p "F") (reset-parser p) (parser-current-pos p)))) (assert (equal "cd" (let ((p (new-parser "abcdef"))) (extract-from-to p "b" "e")))) (assert (equal "cd" (let ((p (new-parser "abcdef"))) (extract-between p '((forward-past "b")) '((forward-to "e")))))) (assert (equal 16 (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward-to-regex p "b[^a]") (parser-current-pos p)))) (assert (equal 7 (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward-past-first-of p '("qux" "bar")) (parser-current-pos p)))) (assert (equal 4 (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward-to-first-of p '("qux" "bar")) (parser-current-pos p)))) (assert (equal 3 (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward p 3) (parser-current-pos p)))) (assert (not (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward p 9999)))) (assert (equal 2 (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward p 3) (backward p 1) (parser-current-pos p)))) (assert (not (let ((p (new-parser "foo bar qux baz bla doo dee dah doh dnh"))) (forward p 3) (backward p 99999)))) (assert (equal 3 (let ((p (new-parser "foo bar qux"))) (set-position p 3) (parser-current-pos p)))) (assert (not (let ((p (new-parser "foo bar qux"))) (set-position p -3)))) (assert (equal "foo" (let ((p (new-parser "foo bar qux"))) (extract p 0 3)))) (assert (equal 3 (let ((p (new-parser "foo bar qux"))) (forward-to p "qux") (rewind-to p "foo") (parser-current-pos p)))) (assert (equal 0 (let ((p (new-parser "foo bar qux"))) (forward-to p "qux") (rewind-past p "foo") (parser-current-pos p)))) (assert (equal '("a" "b" "c") (let ((p (new-parser " a b c "))) (extract-all p '((forward-past " ")) '((forward-to " ")))))) (assert (equal 0 (let ((p (new-parser " a b c "))) (apply-commands p '((forward-past "a") (forward-past "c") (rewind-past "a") (rewind-past " "))) (parser-current-pos p)))) ;; should we add these enhancements? ;; backward-to-first-of ;; backward-past-first-of ;; backward-to-regex ;; (mapcar #'car *landmark-command-dispatch-table*) ;; need a 'consume' operation, that moves forward