WiLiKiソース解読:src/rssmix.cgi
#!/home/shiro/bin/gosh ;;; ;;; wiliki/rssmix - Fetch and show RSSs ;;; ;;; Copyright (c) 2003 Shiro Kawai, All rights reserved. ;;; ;;; Permission is hereby granted, free of charge, to any person ;;; obtaining a copy of this software and associated documentation ;;; files (the "Software"), to deal in the Software without restriction, ;;; including without limitation the rights to use, copy, modify, ;;; merge, publish, distribute, sublicense, and/or sell copies of ;;; the Software, and to permit persons to whom the Software is ;;; furnished to do so, subject to the following conditions: ;;; ;;; The above copyright notice and this permission notice shall be ;;; included in all copies or substantial portions of the Software. ;;; ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES ;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ;;; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN ;;; AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF ;;; OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ;;; IN THE SOFTWARE. ;;; ;;; $Id: rssmix.cgi,v 1.10 2003/02/20 22:05:05 shirok Exp $ ;;; ;; THIS IS AN EXPERIMENTAL SCRIPT. Eventually this will be a part of ;; WiLiKi release. ;; ;; Requires the newest CVS snapshot of Gauche as of 2003/2/16, in which ;; a crucial bug about multithreading is fixed. ;; Requires SXML-gauche 0.9.
(use srfi-1) (use srfi-2) (use srfi-13) (use srfi-14) (use srfi-19) (use rfc.http) (use rfc.uri) (use text.html-lite) (use util.list) (use sxml.ssax) (use gauche.threads) (use gauche.uvector) (use gauche.regexp) (use gauche.charconv) (use dbm) (use www.cgi)
(autoload dbm.gdbm <gdbm>)
(define-constant USER_AGENT "wiliki/rssmix http://www.shiro.dreamhost.com/scheme/wiliki/rssmix.cgi")
(define-constant NAMESPACES '((rdf . "http://www.w3.org/1999/02/22-rdf-syntax-ns#") (rss . "http://purl.org/rss/1.0/") (dc . "http://purl.org/dc/elements/1.1/")))
(define-class <rssmix> () ((sites :init-keyword :sites :init-value '()) ;; - list of monitoring sites. Each entry should be ;; (IDENT HOME-URI RSS-URI) (num-items :init-keyword :num-items :init-value 70) ;; - # of entries to show (title :init-keyword :title :init-value "Recent Changes") (db-name :init-keyword :db-name :init-value "/home/shiro/data/rssmix.dbm") (db-type :init-keyword :db-type :init-form <gdbm>) (cache-life :init-keyword :cache-life :init-value 1800) ;; - lifetime of cache, in seconds. (fetch-timeout :init-keyword :fetch-timeout :init-value 15) ;; - timeout value to fetch RSS (max-title-width :init-keyword :max-title-width :init-value 65) ;; - entry longer than this will be truncated (db :init-value #f) ;; - opened dbm instance (db-lock :init-form (make-mutex)) ;; - mutex for db ))
;; temporary structure to represent site item info (define-class <rss-item> () ((site-id :init-keyword :site-id) (site-url :init-keyword :site-url) (title :init-keyword :title) (link :init-keyword :link) (date :init-keyword :date) ))
(define-syntax with-rss-db (syntax-rules () ((_ self . body) (let* ((s self) (lock (ref s 'db-lock))) (dynamic-wind (lambda () (mutex-lock! lock)) (lambda () (let1 db (dbm-open (ref s 'db-type) :path (ref s 'db-name) :rwmode :write) (set! (ref s 'db) db) (with-error-handler (lambda (e) (dbm-close db) (raise e)) (lambda () (receive r (begin . body) (dbm-close db) (apply values r)))))) (lambda () (mutex-unlock! lock)))) )))
;; an ad-hoc function to estimate width of the string (define (char-width ch) (if (< (char->integer ch) 256) 1 2))
(define (string-width str) (string-fold (lambda (ch w) (+ w (char-width ch))) 0 str))
(define (string-chop str width) (with-string-io str (lambda () (let loop ((w 0) (ch (read-char))) (unless (or (eof-object? ch) (> w width)) (write-char ch) (loop (+ w (char-width ch)) (read-char)))))))
(define (rss-format-date unix-time) (sys-strftime "%Y/%m/%d %H:%M:%S %Z" (sys-localtime unix-time)))
(define-method rss-page ((self <rssmix>) title body) `("Content-Style-Type: text/css\n" ,(cgi-header :content-type #`"text/html; charset=\"EUC-JP\"") ,(html-doctype :type :transitional) ,(html:html (html:head (html:title (html-escape-string title)) (html:link :rel "stylesheet" :href "wiliki-sample.css" :type "text/css")) (html:body (html:h1 (html-escape-string title)) (html:div :align "right" "[" (html:a :href "http://www.shiro.dreamhost.com/scheme/wiliki/wiliki.cgi?WiLiKi:RSSMix"" "[" (html:a :href "?c=info" "Sources") "]") (html:hr) body))))
(define-method rss-error-page ((self <rssmix>) e) (rss-page self "Error" (html:p (html-escape-string (ref e 'message)))))
(define-method rss-recent-changes ((self <rssmix>)) (rss-page self (ref self 'title) (html:table (map (lambda (item) (html:tr (html:td (rss-format-date (ref item 'date))) (html:td (let* ((id (ref item 'site-id)) (title (ref item 'title)) (titlew (string-width title)) (len (- (ref self 'max-title-width) (+ (string-width id) titlew))) ) (when (negative? len) (set! title #`",(string-chop title (+ titlew len)) ...")) (list (html:a :href (ref item 'site-url) (html-escape-string id)) ": " (html:a :href (ref item 'link) (html-escape-string title)) ) )))) (take* (collect self) (ref self 'num-items))) )))
(define-method rss-site-info ((self <rssmix>)) (let* ((sites (ref self 'sites)) (infos (with-rss-db self (map (lambda (s) (read-from-string (dbm-get (ref self 'db) (car s) "#f"))) sites))) ) (rss-page self "RSSMix: Site Info" (map (lambda (site info) `(,(html:h3 (html-escape-string (car site))) ,(html:table (html:tr (html:td "Title") (html:td (get-keyword :channel-title info "--"))) (html:tr (html:td "Top") (html:td (html:a :href (cadr site) (html-escape-string (cadr site))))) (html:tr (html:td "RSS") (html:td (html:a :href (caddr site) (html-escape-string (caddr site))))) (html:tr (html:td "Last fetched") (html:td (or (and-let* ((info) (ts (get-keyword :timestamp info #f))) (rss-format-date ts)) "--"))) (html:tr (html:td "Time spent") (html:td (or (and-let* ((info) (ts (get-keyword :elapsed info #f))) ts) "--"))) ))) sites infos) )))
(define-method rss-main ((self <rssmix>)) (cgi-main (lambda (params) (let1 command (cgi-get-parameter "c" params :default "list") (cond ((equal? command "info") (rss-site-info self)) ((equal? command "list") (rss-recent-changes self)) (else (error "Unknown command" command))))) :on-error (lambda (e) (rss-error-page self e))) 0)
;; Collect RSS info from given sites. (define (collect self) (let* ((sites (ref self 'sites)) (getters (with-rss-db self (map (lambda (site) (get-rss self (car site) (caddr site))) sites))) (timeout (add-duration (current-time) ;; NB: this requires fixed srfi-19.scm (make-time 'time-duration 0 (ref self 'fetch-timeout)))) ) (sort (append-map (lambda (site getter) (or (and-let* ((items (getter timeout))) (map (lambda (item) (make <rss-item> :site-id (car site) :site-url (cadr site) :title (car item) :link (cadr item) :date (caddr item))) items)) '())) sites getters) (lambda (a b) (> (ref a 'date) (ref b 'date)))) ))
;; Returns a procedure PROC, that takes a srfi-time and returns RSS data, ;; which is a list of (TITLE LINK UNIX-TIME). ;; The time passed to PROC specifies a limit when thread can wait to fetch ;; the RSS. If the RSS is cached and up to date, PROC promptly returns it. ;; If there is no cache or the cache is obsolete, a thread is spawned to ;; fetch RSS. If something goes wrong, PROC returns #f. ;; Cache is updated accodringly within PROC. ;; NB: this is called from primordial thread, so we don't need to lock db. (define (get-rss self id rss-url) (let* ((cached (and-let* ((body (dbm-get (ref self 'db) id #f))) (read-from-string body))) (timestamp (and cached (get-keyword :timestamp cached 0))) (rss (and cached (get-keyword :rss-cache cached #f))) (now (sys-time)) ) (if (and rss (> timestamp (- now (ref self 'cache-life)))) (lambda (timeout) rss) ;; active (let1 t (thread-start! (make-thread (make-thunk self id rss-url now) id)) (lambda (timeout) (let1 r (thread-join! t timeout 'timeout) (if (eq? r 'timeout) (begin (record-timeout self id) rss) r))))) ))
;; Record the fact that timeout occurred. Must be called from main thread. (define (record-timeout self id) (with-rss-db self (and-let* ((db (ref self 'db)) (cached (read-from-string (dbm-get db id "#f"))) (channel-title (get-keyword :channel-title cached #f)) (timestamp (get-keyword :timestamp cached #f)) (rss-cache (get-keyword :rss-cache cached #f)) (data (list :timestamp timestamp :rss-cache rss-cache :channel-title channel-title :elapsed 'timeout))) (dbm-put! db id (write-to-string data))))) ;; Creates a thunk for thread. (define (make-thunk self id uri start-time) (lambda () (with-error-handler (lambda (e) (display (ref e 'message) (current-error-port)) #f) (lambda () (let1 rss (fetch uri) (and rss (let* ((now (sys-time)) (data (list :timestamp now :rss-cache (cdr rss) :channel-title (car rss) :elapsed (- now start-time)))) (with-rss-db self (dbm-put! (ref self 'db) id (write-to-string data))) (cdr rss))) )) )))
;; Fetch RSS from specified URI, parse it, and extract link information ;; with updated dates. Returns list of items, in ;; (TITLE URI DATETIME) ;; where DATETIME is in time-utc. ;; When error, returns #f. (define (fetch uri) (and-let* ((match (#/^http:\/\/([^\/]+)/ uri)) (server (match 1)) (path (match 'after))) (receive (status headers body) (http-get server path :user-agent USER_AGENT) (and-let* (((equal? status "200")) ((string? body)) (encoding (body-encoding body))) (extract-from-rdf (SSAX:XML->SXML (wrap-with-input-conversion (open-input-string body) encoding) NAMESPACES)))) ))
;; Figure out the encoding of the returned body. At this point, ;; the body might be an incomplete string, so we have to be careful. ;; Returns #f if body is not a valid xml doc. (define (body-encoding body) (and-let* ((body (string-complete->incomplete body)) (before (string-scan body #*"?>" 'before)) (enc (string-scan before #*"encoding=\"" 'after)) (enc2 (string-scan enc #*"\"" 'before))) enc2))
;; Traverse RDF tree and obtain necessary info. ;; It would be better to use SXPath, but for now... (define (extract-from-rdf sxml)
(define (find-node tag parent) (and (pair? parent) (find (lambda (n) (eq? (car n) tag)) (cdr parent))))
(define (filter-node tag parent) (and (pair? parent) (filter (lambda (n) (eq? (car n) tag)) (cdr parent))))
;; NB: srfi-19's string->date fails to recognize time zone offset ;; with ':' between hours and minutes. I need to parse it manually. (define (parse-date date) (and-let* ((match (#/^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)([+-]\d\d):(\d\d)/ date))) (receive (year month day hour minute second zh zm) (apply values (map (lambda (i) (x->integer (match i))) (iota 8 1))) (time-second (date->time-utc (make-date 0 second minute hour day month year (* (if (negative? zh) -1 1) (+ (* (abs zh) 3600) (* zm 60)))))) ))) (let* ((rdf (find-node 'rdf:RDF sxml)) (chan (find-node 'rss:channel rdf)) (chan-title (find-node 'rss:title chan)) (items (filter-node 'rss:item rdf))) (cons (and (pair? chan-title) (if (and (pair? (cadr chan-title)) (eq? (caadr chan-title) '@)) (caddr chan-title) (cadr chan-title))) (filter-map (lambda (item) (let ((title (and-let* ((n (find-node 'rss:title item))) (cadr n))) (link (and-let* ((n (find-node 'rss:link item))) (cadr n))) (date (and-let* ((n (find-node 'dc:date item))) (parse-date (cadr n))))) (and title link date (list title link date)))) items))) )
;; Entry-point (define (main args) (rss-main (make <rssmix> :sites '(("WiLiKi" "http://www.shiro.dreamhost.com/scheme/wiliki/wiliki.cgi" "http://www.shiro.dreamhost.com/scheme/wiliki/wiliki.cgi?c=rss") ("SchemeXref" "http://www.shiro.dreamhost.com/scheme/wiliki/schemexref.cgi" "http://www.shiro.dreamhost.com/scheme/wiliki/schemexref.cgi?c=rss") ("ねるWiki" "http://www.soraneko.com/~nel/wiliki.cgi" "http://www.soraneko.com/~nel/wiliki.cgi?c=rss") ("スラド" "http://slashdot.jp/" "http://slashdot.jp/slashdot.rdf") ("On Off and Beyond" "http://blog.neoteny.com/chika/" "http://blog.neoteny.com/chika/index.rdf") ("WikiLike" "http://ishinao.net/WikiLike/" "http://ishinao.net/WikiLike/rss.php") ("@pm" "http://gnk.s15.xrea.com/" "http://gnk.s15.xrea.com/index.rdf") ("wiki on ishinao.net" "http://ishinao.net/pukiwiki/" "http://ishinao.net/pukiwiki/?cmd=rss") ) :title "RSSMix: Recent Entries")))
;; Local variables: ;; mode: scheme ;; end: