WiLiKiソース解読:src/rssmix.cgi

WiLiKiソース解読:src/rssmix.cgi

 #!/home/shiro/bin/gosh
 ;;;
 ;;; wiliki/rssmix - Fetch and show RSSs
 ;;;
 ;;;  Copyright (c) 2003 Shiro Kawai, All rights reserved.
 ;;;
 ;;;  Permission is hereby granted, free of charge, to any person
 ;;;  obtaining a copy of this software and associated documentation
 ;;;  files (the "Software"), to deal in the Software without restriction,
 ;;;  including without limitation the rights to use, copy, modify,
 ;;;  merge, publish, distribute, sublicense, and/or sell copies of
 ;;;  the Software, and to permit persons to whom the Software is
 ;;;  furnished to do so, subject to the following conditions:
 ;;;
 ;;;  The above copyright notice and this permission notice shall be
 ;;;  included in all copies or substantial portions of the Software.
 ;;;
 ;;;  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 ;;;  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 ;;;  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 ;;;  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 ;;;  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 ;;;  AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 ;;;  OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 ;;;  IN THE SOFTWARE.
 ;;;
 ;;;  $Id: rssmix.cgi,v 1.10 2003/02/20 22:05:05 shirok Exp $
 ;;;
 
 ;; THIS IS AN EXPERIMENTAL SCRIPT.  Eventually this will be a part of
 ;; WiLiKi release.
 ;;
 ;; Requires the newest CVS snapshot of Gauche as of 2003/2/16, in which
 ;;   a crucial bug about multithreading is fixed.
 ;; Requires SXML-gauche 0.9.
 (use srfi-1)
 (use srfi-2)
 (use srfi-13)
 (use srfi-14)
 (use srfi-19)
 (use rfc.http)
 (use rfc.uri)
 (use text.html-lite)
 (use util.list)
 (use sxml.ssax)
 (use gauche.threads)
 (use gauche.uvector)
 (use gauche.regexp)
 (use gauche.charconv)
 (use dbm)
 (use www.cgi)
 (autoload dbm.gdbm <gdbm>)
 (define-constant USER_AGENT
  "wiliki/rssmix http://www.shiro.dreamhost.com/scheme/wiliki/rssmix.cgi")
 (define-constant NAMESPACES
  '((rdf . "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    (rss . "http://purl.org/rss/1.0/")
    (dc  . "http://purl.org/dc/elements/1.1/")))
 (define-class <rssmix> ()
  ((sites :init-keyword :sites :init-value '())
   ;; - list of monitoring sites.  Each entry should be
   ;;     (IDENT HOME-URI RSS-URI)
   (num-items :init-keyword :num-items :init-value 70)
   ;; - # of entries to show
   (title :init-keyword :title :init-value "Recent Changes")
   (db-name :init-keyword :db-name :init-value "/home/shiro/data/rssmix.dbm")
   (db-type :init-keyword :db-type :init-form <gdbm>)
   (cache-life :init-keyword :cache-life :init-value 1800)
   ;; - lifetime of cache, in seconds.
   (fetch-timeout :init-keyword :fetch-timeout :init-value 15)
   ;; - timeout value to fetch RSS
   (max-title-width :init-keyword :max-title-width :init-value 65)
   ;; - entry longer than this will be truncated
   (db      :init-value #f)
   ;; - opened dbm instance
   (db-lock :init-form (make-mutex))
   ;; - mutex for db
   ))
 ;; temporary structure to represent site item info
 (define-class <rss-item> ()
  ((site-id  :init-keyword :site-id)
   (site-url :init-keyword :site-url)
   (title    :init-keyword :title)
   (link     :init-keyword :link)
   (date     :init-keyword :date)
   ))
 (define-syntax with-rss-db
  (syntax-rules ()
    ((_ self . body)
     (let* ((s  self)
            (lock (ref s 'db-lock)))
       (dynamic-wind
        (lambda () (mutex-lock! lock))
        (lambda ()
          (let1 db (dbm-open (ref s 'db-type)
                             :path (ref s 'db-name) :rwmode :write)
            (set! (ref s 'db) db)
            (with-error-handler
                (lambda (e) (dbm-close db) (raise e))
              (lambda ()
                (receive r (begin . body)
                  (dbm-close db)
                  (apply values r))))))
        (lambda () (mutex-unlock! lock))))
     )))
 ;; an ad-hoc function to estimate width of the string
 (define (char-width ch)
  (if (< (char->integer ch) 256) 1 2))
 (define (string-width str)
  (string-fold (lambda (ch w) (+ w (char-width ch))) 0 str))
 (define (string-chop str width)
  (with-string-io str
    (lambda ()
      (let loop ((w 0) (ch (read-char)))
        (unless (or (eof-object? ch) (> w width))
          (write-char ch)
          (loop (+ w (char-width ch)) (read-char)))))))
 (define (rss-format-date unix-time)
  (sys-strftime "%Y/%m/%d %H:%M:%S %Z" (sys-localtime unix-time)))
 (define-method rss-page ((self <rssmix>) title body)
  `("Content-Style-Type: text/css\n"
    ,(cgi-header :content-type #`"text/html; charset=\"EUC-JP\"")
    ,(html-doctype :type :transitional)
    ,(html:html
      (html:head
       (html:title (html-escape-string title))
       (html:link :rel "stylesheet" :href "wiliki-sample.css"
                  :type "text/css"))
      (html:body
       (html:h1 (html-escape-string title))
       (html:div :align "right"
                 "[" (html:a :href "http://www.shiro.dreamhost.com/scheme/wiliki/wiliki.cgi?WiLiKi:RSSMix""
                 "[" (html:a :href "?c=info" "Sources") "]")
       (html:hr)
       body))))
 (define-method rss-error-page ((self <rssmix>) e)
  (rss-page
   self
   "Error"
   (html:p (html-escape-string (ref e 'message)))))
 (define-method rss-recent-changes ((self <rssmix>))
  (rss-page
   self
   (ref self 'title)
   (html:table
    (map (lambda (item)
           (html:tr
            (html:td (rss-format-date (ref item 'date)))
            (html:td
             (let* ((id (ref item 'site-id))
                    (title (ref item 'title))
                    (titlew (string-width title))
                    (len (- (ref self 'max-title-width)
                            (+ (string-width id) titlew)))
                    )
               (when (negative? len)
                 (set! title #`",(string-chop title (+ titlew len)) ..."))
               (list
                (html:a :href (ref item 'site-url) (html-escape-string id))
                ": "
                (html:a :href (ref item 'link) (html-escape-string title))
                )
               ))))
         (take* (collect self) (ref self 'num-items)))
    )))
 (define-method rss-site-info ((self <rssmix>))
  (let* ((sites (ref self 'sites))
         (infos (with-rss-db self
                  (map (lambda (s)
                         (read-from-string (dbm-get (ref self 'db) (car s) "#f")))
                       sites)))
         )
    (rss-page
     self
     "RSSMix: Site Info"
     (map (lambda (site info)
            `(,(html:h3 (html-escape-string (car site)))
              ,(html:table
                (html:tr (html:td "Title")
                         (html:td (get-keyword :channel-title info "--")))
                (html:tr (html:td "Top")
                         (html:td (html:a :href (cadr site)
                                          (html-escape-string (cadr site)))))
                (html:tr (html:td "RSS")
                         (html:td (html:a :href (caddr site)
                                          (html-escape-string (caddr site)))))
                (html:tr (html:td "Last fetched")
                         (html:td
                          (or (and-let* ((info)
                                         (ts (get-keyword :timestamp info #f)))
                                (rss-format-date ts))
                              "--")))
                (html:tr (html:td "Time spent")
                         (html:td
                          (or (and-let* ((info)
                                         (ts (get-keyword :elapsed info #f)))
                                ts)
                              "--")))
                )))
          sites infos)
     )))
 (define-method rss-main ((self <rssmix>))
  (cgi-main
   (lambda (params)
     (let1 command (cgi-get-parameter "c" params :default "list")
       (cond
        ((equal? command "info") (rss-site-info self))
        ((equal? command "list") (rss-recent-changes self))
        (else (error "Unknown command" command)))))
   :on-error (lambda (e) (rss-error-page self e)))
  0)
 ;; Collect RSS info from given sites.
 (define (collect self)
  (let* ((sites  (ref self 'sites))
         (getters (with-rss-db self
                    (map (lambda (site)
                           (get-rss self (car site) (caddr site)))
                         sites)))
         (timeout (add-duration
                   (current-time)
                   ;; NB: this requires fixed srfi-19.scm
                   (make-time 'time-duration 0 (ref self 'fetch-timeout))))
         )
    (sort (append-map
           (lambda (site getter)
             (or (and-let* ((items (getter timeout)))
                   (map (lambda (item)
                          (make <rss-item>
                            :site-id (car site) :site-url (cadr site)
                            :title (car item) :link (cadr item)
                            :date (caddr item)))
                        items))
                 '()))
           sites getters)
          (lambda (a b) (> (ref a 'date) (ref b 'date))))
    ))
 ;; Returns a procedure PROC, that takes a srfi-time and returns RSS data,
 ;; which is a list of (TITLE LINK UNIX-TIME).
 ;; The time passed to PROC specifies a limit when thread can wait to fetch
 ;; the RSS.  If the RSS is cached and up to date, PROC promptly returns it.
 ;; If there is no cache or the cache is obsolete, a thread is spawned to
 ;; fetch RSS.   If something goes wrong, PROC returns #f.
 ;; Cache is updated accodringly within PROC.
 ;; NB: this is called from primordial thread, so we don't need to lock db.
 (define (get-rss self id rss-url)
  (let* ((cached (and-let* ((body  (dbm-get (ref self 'db) id #f)))
                   (read-from-string body)))
         (timestamp (and cached (get-keyword :timestamp cached 0)))
         (rss    (and cached (get-keyword :rss-cache cached #f)))
         (now    (sys-time))
         )
    (if (and rss (> timestamp (- now (ref self 'cache-life))))
        (lambda (timeout) rss)  ;; active
        (let1 t
            (thread-start! (make-thread (make-thunk self id rss-url now) id))
          (lambda (timeout)
            (let1 r (thread-join! t timeout 'timeout)
              (if (eq? r 'timeout)
                  (begin (record-timeout self id) rss)
                  r)))))
    ))
 ;; Record the fact that timeout occurred.  Must be called from main thread.
 (define (record-timeout self id)
  (with-rss-db self
    (and-let* ((db (ref self 'db))
               (cached (read-from-string (dbm-get db id "#f")))
               (channel-title (get-keyword :channel-title cached #f))
               (timestamp (get-keyword :timestamp cached #f))
               (rss-cache (get-keyword :rss-cache cached #f))
               (data (list :timestamp timestamp :rss-cache rss-cache
                           :channel-title channel-title
                           :elapsed 'timeout)))
      (dbm-put! db id (write-to-string data)))))
                                
 ;; Creates a thunk for thread.
 (define (make-thunk self id uri start-time)
  (lambda ()
    (with-error-handler
        (lambda (e)
          (display (ref e 'message) (current-error-port))
          #f)
      (lambda ()
        (let1 rss (fetch uri)
          (and rss
               (let* ((now (sys-time))
                      (data (list :timestamp now :rss-cache (cdr rss)
                                  :channel-title (car rss)
                                  :elapsed (- now start-time))))
                 (with-rss-db self
                   (dbm-put! (ref self 'db) id (write-to-string data)))
                 (cdr rss)))
          ))
      )))
 ;; Fetch RSS from specified URI, parse it, and extract link information
 ;; with updated dates.  Returns list of items, in
 ;;  (TITLE URI DATETIME)
 ;; where DATETIME is in time-utc.
 ;; When error, returns #f.
 (define (fetch uri)
  (and-let* ((match  (#/^http:\/\/([^\/]+)/ uri))
             (server (match 1))
             (path   (match 'after)))
    (receive (status headers body)
        (http-get server path :user-agent USER_AGENT)
      (and-let* (((equal? status "200"))
                 ((string? body))
                 (encoding (body-encoding body)))
        (extract-from-rdf
         (SSAX:XML->SXML
          (wrap-with-input-conversion (open-input-string body) encoding)
          NAMESPACES))))
    ))
 ;; Figure out the encoding of the returned body.  At this point,
 ;; the body might be an incomplete string, so we have to be careful.
 ;; Returns #f if body is not a valid xml doc.
 (define (body-encoding body)
  (and-let* ((body   (string-complete->incomplete body))
             (before (string-scan body #*"?>" 'before))
             (enc    (string-scan before #*"encoding=\"" 'after))
             (enc2   (string-scan enc #*"\"" 'before)))
    enc2))
 ;; Traverse RDF tree and obtain necessary info.
 ;; It would be better to use SXPath, but for now...
 (define (extract-from-rdf sxml)
  (define (find-node tag parent)
    (and (pair? parent)
         (find (lambda (n) (eq? (car n) tag)) (cdr parent))))
  (define (filter-node tag parent)
    (and (pair? parent)
         (filter (lambda (n) (eq? (car n) tag)) (cdr parent))))
  ;; NB: srfi-19's string->date fails to recognize time zone offset
  ;; with ':' between hours and minutes.  I need to parse it manually.
  (define (parse-date date)
    (and-let* 
        ((match (#/^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)([+-]\d\d):(\d\d)/ date)))
      (receive (year month day hour minute second zh zm)
          (apply values (map (lambda (i) (x->integer (match i))) (iota 8 1)))
        (time-second
         (date->time-utc (make-date 0 second minute hour day month year
                                    (* (if (negative? zh) -1 1)
                                       (+ (* (abs zh) 3600) (* zm 60))))))
        )))
  
  (let* ((rdf   (find-node 'rdf:RDF sxml))
         (chan  (find-node 'rss:channel rdf))
         (chan-title (find-node 'rss:title chan))
         (items (filter-node 'rss:item rdf)))
    (cons
     (and (pair? chan-title)
          (if (and (pair? (cadr chan-title)) (eq? (caadr chan-title) '@))
              (caddr chan-title)
              (cadr chan-title)))
     (filter-map (lambda (item)
                   (let ((title (and-let* ((n (find-node 'rss:title item)))
                                  (cadr n)))
                         (link  (and-let* ((n (find-node 'rss:link item)))
                                  (cadr n)))
                         (date  (and-let* ((n (find-node 'dc:date item)))
                                  (parse-date (cadr n)))))
                     (and title link date (list title link date))))
                 items)))
  )
 ;; Entry-point
 (define (main args)
  (rss-main
   (make <rssmix>
     :sites '(("WiLiKi"
               "http://www.shiro.dreamhost.com/scheme/wiliki/wiliki.cgi"
               "http://www.shiro.dreamhost.com/scheme/wiliki/wiliki.cgi?c=rss")
              ("SchemeXref"
               "http://www.shiro.dreamhost.com/scheme/wiliki/schemexref.cgi"
               "http://www.shiro.dreamhost.com/scheme/wiliki/schemexref.cgi?c=rss")
              ("ねるWiki"
               "http://www.soraneko.com/~nel/wiliki.cgi"
               "http://www.soraneko.com/~nel/wiliki.cgi?c=rss")
              ("スラド"
               "http://slashdot.jp/"
               "http://slashdot.jp/slashdot.rdf")
              ("On Off and Beyond"
               "http://blog.neoteny.com/chika/"
               "http://blog.neoteny.com/chika/index.rdf")
              ("WikiLike"
               "http://ishinao.net/WikiLike/"
               "http://ishinao.net/WikiLike/rss.php")
              ("@pm"
               "http://gnk.s15.xrea.com/"
               "http://gnk.s15.xrea.com/index.rdf")
              ("wiki on ishinao.net"
               "http://ishinao.net/pukiwiki/"
               "http://ishinao.net/pukiwiki/?cmd=rss")
              )
     :title "RSSMix: Recent Entries")))
 ;; Local variables:
 ;; mode: scheme
 ;; end:
More ...