1 | (ns com.github.kyleburton.sandbox.landmark-parser |
2 | (:import (java.util.regex Pattern Matcher)) |
3 | (:use [com.github.kyleburton.sandbox.utils :as kutils]) |
4 | (:use [com.github.kyleburton.sandbox.regex :as regex-util]) |
5 | (:use [clojure.contrib.str-utils :as str] |
6 | [clojure.contrib.fcase :only (case)])) |
7 |
|
8 | (def *cmds*) |
9 |
|
10 | (defstruct parser :pos :doc :ldoc :doclen) |
11 |
|
12 | (defn make-parser [#^String doc] |
13 | (struct-map parser |
14 | :pos (atom 0) |
15 | :ldoc (.toLowerCase doc) |
16 | :doclen (.length doc) |
17 | :doc doc)) |
18 |
|
19 | (defn forward-past [parser landmark] |
20 | (let [pos (.indexOf (:ldoc parser) (.toLowerCase landmark) @(:pos parser))] |
21 | (if (= -1 pos) |
22 | false |
23 | (do |
24 | (reset! (:pos parser) (+ pos (count landmark))) |
25 | @(:pos parser))))) |
26 |
|
27 | (defn forward-to [parser landmark] |
28 | (let [start (:post parser) |
29 | pos (.indexOf (:ldoc parser) (.toLowerCase landmark) @(:pos parser))] |
30 | (if (= -1 pos) |
31 | false |
32 | (do |
33 | (reset! (:pos parser) pos) |
34 | @(:pos parser))))) |
35 |
|
36 | (defn set-pos! [parser pos] |
37 | (if (or (> pos (:doclen parser)) |
38 | (< pos 0)) |
39 | false |
40 | (do |
41 | (reset! (:pos parser) pos) |
42 | true))) |
43 |
|
44 | (defn forward [parser cnt] |
45 | (let [pos (+ cnt @(:pos parser))] |
46 | (if (> pos (:doclen parser)) |
47 | false |
48 | (do |
49 | (reset! (:pos parser) pos) |
50 | true)))) |
51 |
|
52 | (defn rewind [parser cnt] |
53 | (let [pos (- @(:pos parser) cnt)] |
54 | (if (< pos 0) |
55 | false |
56 | (do |
57 | (reset! (:pos parser) pos) |
58 | true)))) |
59 |
|
60 |
|
61 | (defn rewind-to [p landmark] |
62 | (let [pos (.lastIndexOf (:ldoc p) |
63 | (.toLowerCase landmark) |
64 | @(:pos p))] |
65 | (if (= -1 pos) |
66 | false |
67 | (do |
68 | (reset! (:pos p) (+ pos (count landmark))) |
69 | @(:pos p))))) |
70 |
|
71 | (defn rewind-past [p landmark] |
72 | (let [pos (.lastIndexOf (:ldoc p) |
73 | (.toLowerCase landmark) |
74 | @(:pos p))] |
75 | (if (= -1 pos) |
76 | false |
77 | (do |
78 | (reset! (:pos p) pos) |
79 | @(:pos p))))) |
80 |
|
81 | ;; support either '((:fp "foo") (:fp "bar")) |
82 | ;; or '(:fp "foo" :fp "bar") |
83 | (defn parse-cmds [cmds] |
84 | (cond (and (seq? cmds) |
85 | (seq? (first cmds)) |
86 | (= 2 (count (first cmds)))) |
87 | cmds |
88 | (= 1 (mod (count cmds) 2)) |
89 | (kutils/raise (format "parse-cmds: error, odd number of commands (expected even, symbol/landmark): cmds=%s" cmds)) |
90 | true |
91 | (partition 2 cmds))) |
92 |
|
93 | (defn apply-commands [parser & cmds] |
94 | (loop [[[cmd & args] & cmds] (parse-cmds cmds)] |
95 | (if cmd |
96 | (do |
97 | ;(prn (format "cmd=%s args=%s" cmd args)) |
98 | (if (apply (*cmds* cmd) (cons parser args)) |
99 | (do |
100 | (recur cmds)) |
101 | false)) |
102 | true))) |
103 |
|
104 | (defn do-commands [parser cmds] |
105 | ;; (prn "do-commands: cmds=" cmds) |
106 | (loop [[[cmd & args] & cmds] (parse-cmds cmds)] |
107 | (if cmd |
108 | (do |
109 | ;(prn (format "pos:%d cmd=%s args=%s" @(:pos parser) cmd args)) |
110 | (if (apply (*cmds* cmd) (cons parser args)) |
111 | (do |
112 | ;(prn (format "pos:%d cmd=%s args=%s" @(:pos parser) cmd args)) |
113 | (recur cmds)) |
114 | false)) |
115 | true))) |
116 |
|
117 | (defn forward-past-regex |
118 | "See also regex-util/*common-regexes*" |
119 | [p regex] |
120 | (kutils/log "forward-past-regex regex=%s" regex) |
121 | (let [pat (if (and (keyword? regex) (regex regex-util/*common-regexes*)) |
122 | (regex regex-util/*common-regexes*) |
123 | (Pattern/compile (str regex) (bit-or Pattern/MULTILINE Pattern/CASE_INSENSITIVE))) |
124 | m (.matcher pat (:doc p))] |
125 | (kutils/log "forward-past-regex: pat=%s m=%s" pat m) |
126 | (if (.find m @(:pos p)) |
127 | (do |
128 | (kutils/log "forward-past-regex: found reg:%s at:(%d,%d,)" regex (.start m) (.end m)) |
129 | (reset! (:pos p) (.end m)) |
130 | @(:pos p)) |
131 | false))) |
132 |
|
133 | (defn forward-to-regex [p regex] |
134 | "See also regex-util/*common-regexes*" |
135 | (let [pat (if (and (keyword? regex) (regex regex-util/*common-regexes*)) |
136 | (regex regex-util/*common-regexes*) |
137 | (Pattern/compile (str regex) (bit-or Pattern/MULTILINE Pattern/CASE_INSENSITIVE))) |
138 | m (.matcher pat (:doc p))] |
139 | (kutils/log "forward-to-regex: using pat=%s" pat) |
140 | (if (.find m @(:pos p)) |
141 | (do |
142 | (reset! (:pos p) (.start m)) |
143 | @(:pos p)) |
144 | false))) |
145 |
|
146 | (def *cmds* |
147 | {:apply-commands apply-commands |
148 | :a apply-commands |
149 | :do-commands do-commands |
150 | :d do-commands |
151 | :forward forward |
152 | :f forward |
153 | :forward-past forward-past |
154 | :fp forward-past |
155 | :forward-past-regex forward-past-regex |
156 | :fpr forward-past-regex |
157 | :forward-to forward-to |
158 | :ft forward-to |
159 | :forward-to-regex forward-to-regex |
160 | :ftr forward-to-regex |
161 | :rewind rewind |
162 | :r rewind |
163 | :rewind-to rewind-to |
164 | :rt rewind-to |
165 | :rewind-past rewind-past |
166 | :rp rewind-past}) |
167 |
|
168 |
|
169 | (defn doc-substr [parser cnt] |
170 | (.substring (:doc parser) |
171 | @(:pos parser) |
172 | (+ @(:pos parser) |
173 | cnt))) |
174 |
|
175 | (defn extract [p start-cmds end-cmds] |
176 | (let [orig-pos @(:pos p)] |
177 | ;(prn (format "running start-cmds from:%d looking for %s" orig-pos start-cmds)) |
178 | (if (do-commands p start-cmds) |
179 | (let [spos @(:pos p)] |
180 | ;(prn (format "found start at:%d looking for end %s" spos end-cmds)) |
181 | (if (do-commands p end-cmds) |
182 | (.substring (:doc p) |
183 | spos |
184 | @(:pos p)) |
185 | (do (set-pos! p orig-pos) |
186 | false))) |
187 | (do (set-pos! p orig-pos) |
188 | false)))) |
189 |
|
190 | (defn extract-from [html start-cmds end-cmds] |
191 | (extract (make-parser html) start-cmds end-cmds)) |
192 |
|
193 |
|
194 | (defn extract-all [p start-cmds end-cmds] |
195 | (loop [res []] |
196 | (if (do-commands p start-cmds) |
197 | (let [spos @(:pos p)] |
198 | (if (do-commands p end-cmds) |
199 | (recur (conj res (.substring (:doc p) spos @(:pos p)))) |
200 | res)) |
201 | res))) |
202 |
|
203 | (defn extract-all-from [html start-cmds end-cmds] |
204 | (extract-all (make-parser html) start-cmds end-cmds)) |
205 |
|
206 | (defn table-rows [html] |
207 | (extract-all-from html |
208 | '(:ft "<tr") |
209 | '(:fp "</tr"))) |
210 |
|
211 | (defn row->cells [html] |
212 | (extract-all-from html |
213 | '(:fp "<td" :fp ">") |
214 | '(:ft "</td>"))) |
215 |
|
216 | (defn html->links [html] |
217 | (extract-all-from html |
218 | '(:fp "href=\"") |
219 | '(:ft "\""))) |
220 |
|
221 | (defn html->anchors [html] |
222 | (extract-all-from html |
223 | '(:ft "<a ") |
224 | '(:fp "</a>"))) |
225 |
|
226 | (defn anchor->href [html] |
227 | (first (kutils/re-find-first #"href=\"([^\"]+)\"" html))) |
228 |
|
229 | (defn html-find-link-with-body [html text] |
230 | (first |
231 | (kutils/re-find-first |
232 | #"href=\"([^\"]+)\"" |
233 | (first |
234 | (filter #(.contains % text) |
235 | (html->anchors html)))))) |
236 |
|
237 |
|
238 | (defn html->tables [html] |
239 | (extract-all-from html |
240 | '(:ft "<table") |
241 | '(:fp "</table>"))) |
242 |
|
243 | (defn html-table->matrix [html] |
244 | (map row->cells (table-rows html))) |
245 |
|
246 | (defn html->form-blocks [html] |
247 | (extract-all-from html |
248 | '(:ft "<form") |
249 | '(:fp "</form>"))) |
250 |
|
251 |
|
252 | ;; (def p (make-parser (com.github.kyleburton.sandbox.web/get->string "http://asymmetrical-view.com/"))) |
253 | ;; (forward-past-regex p :num-real) |
254 | ;; (forward-to-regex p #"\d{4}") |
255 |
|
256 | ;; (def pat (Pattern/compile (str #"\d{4}") (bit-or Pattern/MULTILINE Pattern/CASE_INSENSITIVE))) |
257 | ;; (def m (.matcher pat (:doc p))) |
258 |
|
259 |
|
260 | ;; (html->links (com.github.kyleburton.sandbox.web/get->string "http://asymmetrical-view.com/")) |
261 |
|
262 | (defn parse-input-element [html] |
263 | {:tag :input |
264 | :type (first (kutils/re-find-first "(?-ims:type=\"([^\"]+)\")" html)) |
265 | :name (first (kutils/re-find-first "(?-ims:name=\"([^\"]+)\")" html)) |
266 | :value (first (kutils/re-find-first "(?-ims:value=\"([^\"]+)\")" html)) |
267 | }) |
268 |
|
269 | ;; This technique won't work reliably...need to implement :forward-to-first-of '(:ft "<input") '(:ftfo "/>" "</input>" |
270 | ;; TODO: parse out textarea, button and select |
271 | (defn parse-form-elements [html] |
272 | (apply concat [(map parse-input-element (extract-all-from html '(:ft "<input") '(:fp ">"))) |
273 | ;; (extract-all-from html '(:ft "<textarea") '(:fp "</textarea>")) |
274 | ;; (extract-all-from html '(:ft "<button") '(:fp ">")) |
275 | ;; (extract-all-from html '(:ft "<select") '(:fp "</select>")) |
276 | ])) |
277 |
|
278 | ;;(parse-form-elements (first (html->form-blocks com.github.kyleburton.sandbox.web/html))) |
279 |
|
280 | (defn parse-form [html] |
281 | {:method (or (first (kutils/re-find-first "(?-ims:method=\"([^\"]+)\")" html)) |
282 | "GET") |
283 | :action (or (first (kutils/re-find-first "(?-ims:action=\"([^\"]+)\")" html)) |
284 | nil) |
285 | :params (vec (parse-form-elements html)) |
286 | }) |
287 |
|