1 | (ns com.github.kyleburton.sandbox.tiger |
2 | (:import (org.apache.commons.net.ftp FTP FTPClient)) |
3 | (:require [com.github.kyleburton.sandbox.ftp :as ftp] |
4 | [com.github.kyleburton.sandbox.landmark-parser :as lparse] |
5 | [com.github.kyleburton.sandbox.utils :as kutils]) |
6 | (:use [clojure.contrib.str-utils :as str] |
7 | [clojure.contrib.fcase :only (case)])) |
8 |
|
;; Base URL of the TIGER/Line 2008 tree on ftp2.census.gov.  Login is
;; anonymous; the password part is the URL-encoded form of user@host.com
;; (%40 is '@').
(def *tiger-ftp-url* "ftp://anonymous:user%40host.com@ftp2.census.gov/geo/tiger/TIGER2008/")

;; REPL scratch: directory listings of the FTP root.  These used to be bare
;; top-level forms, which made merely loading this namespace perform three
;; network round-trips whose results were thrown away.  They now sit inside
;; `comment` so they run only when evaluated by hand.
(comment
  (ftp/list-all *tiger-ftp-url*)
  (ftp/list-files *tiger-ftp-url*)
  (ftp/list-directories *tiger-ftp-url*))
14 |
|
;; REPL scratch (never run at load time): fetch every file that
;; `ftp/list-files` reports for the 42_PENNSYLVANIA directory into
;; ~/data-sets/tiger-line/data/PENNSYLVANIA, skipping files that already
;; exist locally.
;;
;; Changes from the earlier version of this form:
;;  * the "fetch url" message had two %s placeholders but three arguments,
;;    so the local destination was silently dropped from the output;
;;  * `doseq` is already eager and returns nil, so the surrounding `dorun`
;;    did nothing and has been removed.
(comment
  (let [data-dir (kutils/expand-file-name "~/data-sets/tiger-line/data")
        state    "PENNSYLVANIA"
        url      (str *tiger-ftp-url* "42_" state)]
    ;; make sure the local target directory exists before downloading
    (.mkdirs (java.io.File. data-dir state))
    (doseq [file (ftp/list-files url)]
      (let [local-file (str data-dir "/" state "/" file)]
        (if (.exists (java.io.File. local-file))
          (prn (format "already have: %s" local-file))
          (do
            (prn (format "fetch url %s/%s => %s" url file local-file))
            (ftp/retrieve-file url file local-file)))))))
28 |
|
;; REPL scratch (never run at load time): fetch every file that
;; `ftp/list-files` reports for 42_PENNSYLVANIA/42045_Delaware_County into
;; the matching directory under ~/data-sets/tiger-line/data, skipping files
;; that already exist locally.
;;
;; Changes from the earlier version of this form:
;;  * the "fetch url" message had two %s placeholders but three arguments,
;;    so the local destination was silently dropped from the output;
;;  * `doseq` is already eager and returns nil, so the surrounding `dorun`
;;    did nothing and has been removed.
(comment
  (let [data-dir (kutils/expand-file-name "~/data-sets/tiger-line/data")
        state    "PENNSYLVANIA/42045_Delaware_County"
        url      (str *tiger-ftp-url* "42_" state)]
    ;; make sure the local target directory exists before downloading
    (.mkdirs (java.io.File. data-dir state))
    (doseq [file (ftp/list-files url)]
      (let [local-file (str data-dir "/" state "/" file)]
        (if (.exists (java.io.File. local-file))
          (prn (format "already have: %s" local-file))
          (do
            (prn (format "fetch url %s/%s => %s" url file local-file))
            (ftp/retrieve-file url file local-file)))))))
42 |
|
43 | ;; (ftp/retrieve-file *tiger-ftp-url* "tl_2008_us_nectadiv.zip" "/tmp/tl_2008_us_nectadiv.zip") |
44 |
|
;; enumeration->seq: turn an enumeration-like Java object into a Clojure
;; seq.  Dispatches on the class of its single argument.
(defmulti enumeration->seq class)

;; java.util.Enumeration: eagerly drain all remaining elements into a
;; vector and return a seq over it (nil when nothing is left).
(defmethod enumeration->seq java.util.Enumeration [en]
  (seq (into [] (enumeration-seq en))))

;; java.util.zip.ZipFile: delegate to the Enumeration method using the
;; enumeration returned by the file's .entries method.
(defmethod enumeration->seq java.util.zip.ZipFile [zip]
  (enumeration->seq (.entries zip)))
55 |
|
56 | ;; (enumeration->seq (.entries (java.util.zip.ZipFile. (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/tl_2008_42_bg00.zip")))) |
57 | ;; (enumeration->seq (java.util.zip.ZipFile. (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/tl_2008_42_bg00.zip"))) |
58 |
|
59 | ;; (map #(.getName %) (enumeration->seq (java.util.zip.ZipFile. (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/tl_2008_42_bg00.zip")))) |
60 |
|
;; (def dbf (org.xBaseJ.DBF. (kutils/expand-file-name "~/data-sets/tiger-line/data/tmp/tl_2008_us_state.dbf")))
62 |
|
63 | ;; (doseq [ii (range 1 (+ 1 (.getFieldCount dbf)))] |
64 | ;; (prn (format "field[%d]: %s" ii (.getName (.getField dbf ii))))) |
65 |
|
66 | ;; (.getRecordCount dbf) |
67 |
|
68 |
|
(defn fields
  "Returns a lazy seq of the field objects of `dbf`, obtained by calling
  .getField with each index from 1 through (.getFieldCount dbf) inclusive."
  [#^org.xBaseJ.DBF dbf]
  (map #(.getField dbf %)
       (range 1 (inc (.getFieldCount dbf)))))
72 |
|
(defn for-each-row
  "Calls `f` once per record of `dbf`, for side effects; returns nil.

  `f` receives two arguments: the zero-based record number and the seq of
  field objects produced by `fields`.  The same seq is passed on every
  call; (.read dbf) is invoked before each call.
  NOTE(review): presumably .read advances to the next record and refreshes
  the values held by those field objects - confirm against xBaseJ."
  [f #^org.xBaseJ.DBF dbf]
  ;; The callback parameter is named `f` rather than `fn`, and the local is
  ;; `flds` rather than `fields`, so neither the core `fn` macro nor the
  ;; sibling `fields` function is shadowed inside this body.
  (let [flds (fields dbf)]
    (dotimes [recno (.getRecordCount dbf)]
      (.read dbf)
      (f recno flds))))
78 |
|
79 | ;; (with-open [dbf (org.xBaseJ.DBF. (kutils/expand-file-name "~/data-sets/tiger-line/data/tmp/tl_2008_us_state.dbf"))] |
80 | ;; (for-each-row (fn [recno fields] |
81 | ;; (doseq [field fields] |
82 | ;; (prn (format "recno[%d] [%s/%s]=%s" |
83 | ;; recno |
84 | ;; (.getType field) |
85 | ;; (.getName field) |
86 | ;; (.trim (.get field)))))) |
87 | ;; dbf)) |
88 |
|
(defn field-names
  "Returns a lazy seq with the result of .getName for each field object
  that `fields` yields for `dbf`, in the same order."
  [#^org.xBaseJ.DBF dbf]
  (for [fld (fields dbf)]
    (.getName fld)))
91 |
|
92 | ;; (field-names (org.xBaseJ.DBF. (kutils/expand-file-name "~/data-sets/tiger-line/data/tmp/tl_2008_us_state.dbf"))) |
93 |
|
(defn dbf->tabfile
  "Dumps the DBF file `dbfile` to `tabfile` as tab-separated text.

  `dbfile` is handed to the org.xBaseJ.DBF constructor and `tabfile` to the
  java.io.PrintWriter constructor; both are closed when the dump finishes.
  The first output line is the field names; each later line is one record,
  with every field value trimmed of surrounding whitespace."
  [dbfile tabfile]
  (with-open [dbf  (org.xBaseJ.DBF. dbfile)
              outp (java.io.PrintWriter. tabfile)]
    (let [emit-line (fn [cells]
                      (println (str/str-join "\t" cells)))]
      ;; println writes to *out*, so rebinding it sends every line to outp
      (binding [*out* outp]
        (emit-line (field-names dbf))
        (for-each-row
          (fn [recno row-fields]
            (emit-line (for [fld row-fields]
                         (.trim (.get fld)))))
          dbf)))))
103 |
|
;; REPL scratch: one-off DBF -> tab-file conversions.  Nothing in this
;; form runs when the namespace is loaded; evaluate an inner form by hand
;; to perform that conversion.
(comment
  ;; nationwide files
  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/tmp/tl_2008_us_state.dbf")
    (kutils/expand-file-name "~/us_state.tab"))

  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/tmp/tl_2008_us_county.dbf")
    (kutils/expand-file-name "~/us_county.tab"))

  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/tmp/tl_2008_us_zcta5.dbf")
    (kutils/expand-file-name "~/us_zip_city.tab"))

  ;; Pennsylvania (state code 42)
  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/tmp/tl_2008_42_place.dbf")
    (kutils/expand-file-name "~/pa_place.tab"))

  ;; Delaware County, PA (42045)
  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/42045_Delaware_County/tmp/tl_2008_42045_addr.dbf")
    (kutils/expand-file-name "~/pa_deleware_addr.tab"))

  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/42045_Delaware_County/tmp/tl_2008_42045_addrfn.dbf")
    (kutils/expand-file-name "~/pa_deleware_addrfn.tab"))

  (dbf->tabfile
    (kutils/expand-file-name "~/data-sets/tiger-line/data/PENNSYLVANIA/42045_Delaware_County/tmp/tl_2008_42045_featnames.dbf")
    (kutils/expand-file-name "~/pa_deleware_featnames.tab")))
131 |
|
132 |
|
133 | ;; (def ua (org.apache.commons.httpclient.HttpClient.)) |
134 | ;; (def req (org.apache.commons.httpclient.methods.GetMethod. "http://www.dbase.com/knowledgebase/int/db7_file_fmt.htm")) |
135 | ;; (def resp (.executeMethod ua req)) |
136 |
|
137 | ;; #'req |
138 |
|
139 |
|
;; REPL scratch (not run at load time): scrape a table of dBASE field-type
;; descriptions out of an HTTP response body and print it as tab-separated
;; rows.  `req` is not defined in this file's live code; it has to be
;; created at the REPL first (see the commented-out HttpClient forms).
(comment
  (def type-descr
    (let [table-html (lparse/extract-from
                       (.getResponseBodyAsString req)
                       '((:fp "Storage of dBASE")
                         (:fp "</tr>"))
                       '((:ft "</table")))
          ;; pull the text of one cell, then drop line breaks and
          ;; collapse runs of spaces into a single space
          clean-cell (fn [raw]
                       (-> (lparse/extract-from raw
                                                '((:ft "</font>")
                                                  (:rt ">"))
                                                '((:ft "</font>")))
                           (.replaceAll "[\r\n]+" "")
                           (.replaceAll " +" " ")))]
      (map (fn [row] (map clean-cell row))
           (map lparse/row->cells (lparse/table-rows table-html)))))

  (print (str/str-join "\n"
                       (for [row type-descr]
                         (str/str-join "\t" row)))))
157 |
|
158 | ;; B Binary, a string 10 digits representing a .DBT block number. |
159 | ;; The number is stored as a string, right justified and padded with blanks. |
160 | ;; C Character All OEM code page characters - padded with blanks to the width of the field. |
161 | ;; D Date 8 bytes - date stored as a string in the format YYYYMMDD. |
162 | ;; N Numeric Number stored as a string, right justified, and padded with blanks to the width of the field. |
163 | ;; L Logical 1 byte - initialized to 0x20 (space) otherwise T or F. |
164 | ;; M Memo, a string 10 digits (bytes) representing a .DBT block number. The number is stored as a |
165 | ;; string, right justified and padded with blanks. |
166 | ;; @ Timestamp 8 bytes - two longs, first for date, second for time. |
167 | ;; The date is the number of days since 01/01/4713 BC. |
168 | ;; Time is hours * 3600000L + minutes * 60000L + Seconds * 1000L |
169 | ;; I Long 4 bytes. Leftmost bit used to indicate sign, 0 negative. |
170 | ;; + Autoincrement Same as a Long |
171 | ;; F Float Number stored as a string, right justified, and padded with blanks to the width of the field. |
172 | ;; O Double 8 bytes - no conversions, stored as a double. |
173 | ;; G OLE 10 digits (bytes) representing a .DBT block number. |
;;   The number is stored as a string, right justified and padded with blanks.