1 | ;; Code for scraping down my livejournal posts -- I'm converting to my |
2 | ;; own site, built via Jekyll as of May 2009... |
3 |
|
4 | (ns com.github.kyleburton.sandbox.lj |
5 | (:use [com.github.kyleburton.sandbox.web :as web] |
6 | [com.github.kyleburton.sandbox.landmark-parser :as lp] |
7 | [com.github.kyleburton.sandbox.utils :as kutils] |
8 | [clojure.contrib.duck-streams :as ds])) |
9 |
|
10 | ;; (def *ljurl* "http://kyle-burton.livejournal.com/") |
11 | ;; (def *ljurl* "http://kyle-burton.livejournal.com/?skip=20") |
;; Journal page to scrape.  NOTE(review): the ?skip=N query parameter looks
;; like a paging offset (the commented-out variants use none and 20) --
;; confirm before changing it.
(def *ljurl*
  "http://kyle-burton.livejournal.com/?skip=40")

;; Root directory of the Jekyll site that receives the converted posts.
;; NOTE(review): kutils/$HOME presumably resolves the path relative to the
;; user's home directory -- confirm against its definition.
(def *jekyll-root*
  (kutils/$HOME "personal/projects/this-blog"))

;; Content retrieved for *ljurl*.  This is a top-level def, so the call to
;; web/get->string happens once, when this file is loaded.
(def main-page
  (web/get->string *ljurl*))
16 |
|
(defn get-posts
  "Return every post fragment that lp/extract-all-from finds in `html`.

  The two landmark patterns below delimit one post each.
  NOTE(review): the exact meaning of the :ft / :rp / :fp landmark commands
  is defined by the landmark-parser namespace -- confirm there."
  [html]
  (let [post-start '(:ft "subj-link" :rp "<table")
        post-end   '(:fp "post comment" :fp "</table>")]
    (lp/extract-all-from html post-start post-end)))
21 |
|
;; Lookup table from three-letter English month abbreviation to its
;; 1-based month number ("Jan" -> 1 ... "Dec" -> 12).
(def *months*
  (zipmap ["Jan" "Feb" "Mar" "Apr" "May" "Jun"
           "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"]
          (range 1 13)))
34 |
|
(defn str->mon
  "Return the month number for `s`, or nil when no month matches.

  `s` matches a month when, ignoring case, it starts with one of the
  abbreviations that are the keys of *months* (so both a short and a
  full month name are accepted)."
  [s]
  (let [matches? (fn [abbrev]
                   (.startsWith (.toLowerCase s) (.toLowerCase abbrev)))]
    ;; Walk the table entries and yield the number of the first hit.
    (some (fn [[abbrev month-number]]
            (when (matches? abbrev)
              month-number))
          *months*)))
39 |
|
(defn parse-lj-date-time
  "Parse the raw date/time text of a post into a map of its components.

  Square brackets are removed from `date`, the result is passed through
  web/strip-html, and the text is split on the first | character into a
  date part and a time part.  The date part is matched as
  <month-name> <day>, <year> and the time part as <hour>:<min> <am-pm>.

  Returns a map with the keys :year, :month, :day, :full-date, :am-pm,
  :hour (as written, 12-hour clock), :hour-military (0-23), :min,
  :sec (always 0) and :full-time.

  Integer/parseInt throws NumberFormatException when a numeric field is
  not a valid integer.
  NOTE(review): this assumes kutils/re-find-first returns only the
  captured groups, in order -- confirm against its definition."
  [date]
  (let [[date-part time-part] (seq (.split (web/strip-html (.replaceAll date "[\\[\\]]" "")) "\\|" 2))
        [month-name day-of-month year] (kutils/re-find-first #"\s*(\S+)\s+(\S+),\s+(\S+)" date-part)
        [hour minute am-pm] (kutils/re-find-first #"\s*(\S+):(\S+)\s+(\S+)" time-part)
        hour-12 (Integer/parseInt hour)
        ;; Accept "pm", "PM", "Pm"...; a nil marker is simply not pm.
        pm? (.equalsIgnoreCase "pm" am-pm)]
    {:year          (Integer/parseInt year)
     :month         (str->mon month-name)
     ;; Drop any non-digit characters around the day before parsing.
     :day           (Integer/parseInt (.replaceAll day-of-month "[^0-9]+" ""))
     :full-date     date-part
     :am-pm         am-pm
     :hour          hour-12
     ;; 12 on a 12-hour clock means 0 (12 am -> 0, 12 pm -> 12), so reduce
     ;; the hour modulo 12 before adding the pm offset.
     :hour-military (+ (rem hour-12 12) (if pm? 12 0))
     :min           (Integer/parseInt minute)
     :sec           0
     :full-time     time-part}))
56 |
|
(defn parse-post
  "Turn the HTML of a single post into a map with the keys :title, :date,
  :tags and :body.

  :date is the map produced by parse-lj-date-time.  :tags is a vector of
  the whitespace-separated words extracted after the Tags marker, or an
  empty vector when the post has no <b>Tags</b> marker.

  NOTE(review): title, date and body are extracted from one shared parser
  object, in that order; if the parser keeps a position between calls the
  order matters, so it is kept explicit here -- confirm in landmark-parser."
  [post]
  (let [parser   (lp/make-parser post)
        title    (lp/extract parser '(:ft "subj-link" :fp ">") '(:ft "<"))
        raw-date (lp/extract parser '(:ft "<td " :fp ">") '(:ft "</td"))
        date     (parse-lj-date-time raw-date)
        ;; Tags are read from the raw post text, not from the shared parser.
        tags     (if (= -1 (.indexOf post "<b>Tags</b>"))
                   []
                   (let [tag-text (lp/extract-from post
                                                   '(:ft "Tags" :fp "href=" :fp ">")
                                                   '(:ft "<"))]
                     (vec (.split tag-text "\\s+"))))
        body     (lp/extract parser '(:fp "<td colspan=") '(:fp "'comments'" :rp "</td>"))]
    {:title title
     :date  date
     :tags  tags
     :body  body}))
68 |
|
69 |
|
70 | ;; (:date (parse-post (first (get-posts main-page)))) |
71 |
|
(defn title->safename
  "Convert a post title into a slug that is safe to use in a file name.

  The title is lower-cased, dots are removed outright (so 1.0 becomes 10),
  every other run of characters that are not a-z or 0-9 is collapsed into
  a single dash, and dashes at the start or end of the result are removed.

  Throws NullPointerException when `title` is nil."
  [title]
  (-> (.toLowerCase title)
      (.replaceAll "[.]+" "")
      (.replaceAll "[^a-z0-9]+" "-")
      ;; Whitespace and punctuation at the edges have become dashes by now,
      ;; so a plain .trim would not remove them; strip the dashes instead.
      (.replaceAll "^-+|-+$" "")))
77 |
|
(defn post->file-name
  "Build the file name for a parsed post:
  <yyyy>-<mm>-<dd>-<slug>.textile, where the date fields come from the
  post's :date map and the slug is title->safename of its :title."
  [post]
  (let [stamp (:date post)
        year  (:year stamp)
        month (:month stamp)
        day   (:day stamp)
        slug  (title->safename (:title post))]
    (format "%04d-%02d-%02d-%s.textile" year month day slug)))
84 |
|
(defn post->full-pathanme
  "Return the full path of the file for `post`: the post's file name
  placed under /site/_posts/ inside *jekyll-root*."
  [post]
  (let [posts-dir (str *jekyll-root* "/site/_posts/")]
    (str posts-dir (post->file-name post))))
87 |
|
;; Runs when this file is loaded: parse every post found in main-page and
;; write it to its own file.  Each file starts with a front matter section
;; (layout and title between two --- lines) followed by the post body.
(doseq [post (map parse-post (get-posts main-page))]
  (let [front-matter ["---"
                      "layout: default"
                      (str "title: " (:title post))
                      "---"]]
    (with-open [out (ds/writer (post->full-pathanme post))]
      ;; Redirect *out* so println writes into the post's file.
      (binding [*out* out]
        (doseq [line (conj front-matter (:body post))]
          (println line))))))
96 |
|
97 | ;; (post->file-name (parse-post (nth (get-posts main-page) 1))) |
98 | ;; "2008-10-21-cloud-con-east-notes.textile" |
99 |
|
100 | ;; (ds/spit (kutils/$HOME "/tmp/lj.html") |
101 | ;; (first (get-posts main-page))) |
102 |
|
103 |
|
104 | ;; (apply concat (map :tags (for [post (get-posts main-page)] |
105 | ;; (parse-post post)))) |