1 | (ns com.github.kyleburton.sandbox.regex |
2 | (:import (java.util.regex Pattern Matcher)) |
3 | (:use [com.github.kyleburton.sandbox.utils :as kutils] |
4 | [com.github.kyleburton.sandbox.ref-data :as ref-data]) |
5 | (:use [clojure.contrib.str-utils :as str] |
6 | [clojure.contrib.fcase :only (case)])) |
7 |
|
8 |
|
;; Regexes; the initial set was pulled from the Regexp::Common CPAN module.
10 | (def *common-regexes* |
11 | {:num-real #"(?-xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|)))" |
12 | :num-int #"(?-xism:(?:(?:[+-]?)(?:[0123456789]+)))" |
13 | :num-decimal #"(?-xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)))" |
14 | :num-hex #"(?-xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789ABCDEF]|[.])(?:[0123456789ABCDEF]*)(?:(?:[.])(?:[0123456789ABCDEF]{0,}))?)(?:(?:[G])(?:(?:[+-]?)(?:[0123456789ABCDEF]+))|)))" |
15 | :num-dec #"(?-xism:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|)))" |
16 | :num-oct #"(?-xism:(?:(?i)(?:[+-]?)(?:(?=[01234567]|[.])(?:[01234567]*)(?:(?:[.])(?:[01234567]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[01234567]+))|)))" |
17 | :num-bin #"(?-xism:(?:(?i)(?:[+-]?)(?:(?=[01]|[.])(?:[01]*)(?:(?:[.])(?:[01]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[01]+))|)))" |
18 | :num-roman #"(?-xism:(?xi)(?=[MDCLXVI]) |
19 | (?:M{0,3} |
20 | (D?C{0,3}|CD|CM)? |
21 | (L?X{0,3}|XL|XC)? |
22 | (V?I{0,3}|IV|IX)?))" |
23 | :zip #"(?-xism:(?:(?:(?:USA?)-){0,1}(?:(?:(?:[0-9]{3})(?:[0-9]{2}))(?:(?:-)(?:(?:[0-9]{2})(?:[0-9]{2}))){0,1})))" |
24 | :net-ipv4 #"(?-xism:(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})))" |
25 | :net-mac #"(?-xism:(?:(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2}):(?:[0-9a-fA-F]{1,2})))" |
26 | :net-domain #"(?-xism:(?: |(?:[A-Za-z](?:(?:[-A-Za-z0-9]){0,61}[A-Za-z0-9])?(?:\.[A-Za-z](?:(?:[-A-Za-z0-9]){0,61}[A-Za-z0-9])?)*)))" |
27 | :phone #"(?:1[- ]?)?\(?[2-9]\d{2}\)?[-\. ]?\d{3}[-\. ]?\d{4}(?:\s*(?:e|ex|ext|x|xtn|extension)?\s*\d*)" |
28 | :us-states (Pattern/compile (format "(?-xism:%s)" (str/str-join "|" (keys ref-data/*us-states*)))) |
29 | :us-state-names (Pattern/compile (format "(?-xism:%s)" (str/str-join "|" (vals ref-data/*us-states*)))) |
30 | :us-airport-codes (Pattern/compile (format "(?-xism:%s)" (str/str-join "|" (map #(nth % 2) ref-data/*us-airport-codes*)))) |
31 | :us-area-codes (Pattern/compile (format "(?-xism:%s)" (str/str-join "|" ref-data/*us-area-codes*))) |
32 |
|
33 |
|
34 | :word #"(?:[\w-]+)" |
35 | :punctuation #"(?:[\.,\?/'\";:\\`~!\(\)]+)" |
36 | }) |
37 |
|
38 | ;; (re-find (:us-airport-codes *common-regexes*) "foo PHL bar") |
39 | ;; (re-find (:us-area-codes *common-regexes*) "foo 484 bar") |
40 | ;; (re-find (:word *common-regexes*) "foo 484 bar") |
41 | ;; (re-find (:punctuation *common-regexes*) "foo 484 , bar") |
42 |
|
43 |
|
44 | ;; (:us-state-names *common-regexes*) |
45 | ;; (:us-states *common-regexes*) |
46 | ;; (re-matches (:zip *common-regexes*) "19087") |
47 |
|
48 |
|
49 |
|
50 | ;; (re-matches (:phone *common-regexes*) "1 (610) 940 4002 x 116") |
51 | ;; (re-matches (:phone *common-regexes*) "1 (610) 940 4002x116") |
52 | ;; (re-matches (:phone *common-regexes*) "(610) 940 4002 x 116") |
53 | ;; (re-matches (:phone *common-regexes*) "610.940.4002") |
54 | ;; (re-matches (:phone *common-regexes*) "610.940.4002") |
55 | ;; (re-matches (:phone *common-regexes*) "1610.940.4002") |
56 | ;; (re-matches (:phone *common-regexes*) "1-610.940.4002") |
57 | ;; (re-matches (:phone *common-regexes*) "1 610.940.4002") |
58 |
|