websteal


websteal

バグ

  1. 基本的に全然完全じゃないです。(ページの最初にバグの項を持ってくるくらい)
  2. href="...." とかの形式に乗っ取ってないとエラーで止まる。つまり実際のリンクじゃなく、コンテンツに埋め込まれた href=&... みたいなの。
  3. href="mailto:..." のやつもファイルと思って残してしまう。
  4. その他、ちゃんとしたパーズしてなくて部分的に(href/srcのみ)切り取っただけなので状況によっては変な動作すると思います。気を付けてください。。。
  5. 何やらコマンドラインから起動すると、エラー発生しちゃうことがある。(今は動いているので追求してません)

要はちゃんと文書全体をパーズしてなくて、 リンクとおぼしき部分にマッチするものを 決め打ちで引っ掛けてるので。

メモ

web サイトからコンテンツをごっそり引っ張って来る。
ルートになるファイルを指定して実行するとリンクをたどって 次々に持って来るようにしている。 もちろん別のホストへのリンクは引っ張らない。(想像するだけで恐いから...)

以下のように SICP とか t-y-scheme みたいなサイトを自分の端末上に ごっそり欲しかったのが書いてみた動機。

HTML 関係あんまり詳しく勉強してないので、 try&error で作ってます。 これから勉強する予定cut-sea:2004/07/25 06:10:31 PDT

  1. "href=" や "src=" は捕捉不要で child-list もすっきりさせる。
  2. get-web-page-from の返り値を Content-Type にする。
  3. websteal 本体がその返り値を見てパーズ+child-list生成するかどうか分岐。
    って組めばよさそうですね。websteal 本体の仕事内容の順番見直し要だな。 でも"href="/"src="って気に入ってなかったから丁度いいわ。cut-sea:2004/07/28 16:53:01 PDT
gosh> (load "./websteal.scm")
#t
gosh> (websteal "http://mitpress.mit.edu/sicp/full-text/book/book.html")
getting file "sicp/full-text/book/book.html" from "mitpress.mit.edu" now ... done
getting file "sicp/full-text/book/book-Z-C.css" from "mitpress.mit.edu" now ... done
getting file "sicp/full-text/book/book-Z-H-1.html" from "mitpress.mit.edu" now ... done

(snip)

getting file "sicp/full-text/book/ch1-Z-G-4.gif" from "mitpress.mit.edu" now ... done
getting file "sicp/full-text/book/ch1-Z-G-5.gif" from "mitpress.mit.edu" now ... done
getting file "sicp/full-text/book/ch1-Z-G-6.gif" from "mitpress.mit.edu" now ... done
getting file "sicp/full-text/book/cover.jpg" from "mitpress.mit.edu" now ... done
#<undef>
gosh> 

ソース

":";exec gosh -b $0 "$@"
;;
;; Usage: ./websteal 'http://mitpress.mit.edu/sicp/full-text/book/book.html'
;;
;; $Id$
;;

;;===================================================
;; use library
;;
(use rfc.http)
(use rfc.uri)
(use file.util)
(use srfi-13)

;;===================================================
;; global values
;;
(define *no-wait* 0)
(define *gentle-wait* 10)

;;===================================================
;; miscellaneous functions
;;
(define-syntax format/flush
  (syntax-rules ()
    ((_ arg ...) (begin
                   (format arg ...)
                   (flush)))))

;;===================================================
;; destruct/construct utility for URL and file name.
;;
(define (root-cut path) (list->string (cdr (string->list path))))
(define (root-paste path) (string-append "/" path))
(define (from-root? path) (equal? #\/ (string-ref path 0)))

(define (break-down-url url)
  (let ((url-head (values-ref (uri-scheme&specific url) 0))
        (url-body (values-ref (uri-scheme&specific url) 1)))
    (let ((url-list (call-with-values
                        (lambda _ (uri-decompose-hierarchical url-body)) list)))
      (let ((host (car url-list))
            (dir  (sys-dirname (cadr url-list)))
            (file (sys-basename (cadr url-list))))
        (lambda (msg)
          (cond ((eq? msg 'url) url)
                ((eq? msg 'type) url-head)
                ((eq? msg 'body) url-body)
                ((eq? msg 'host) host)
                ((eq? msg 'file) file)
                ((eq? msg 'webdir) dir)
                ((eq? msg 'fsdir) (root-cut dir))
                ((eq? msg 'web-full-path) (string-append dir (root-paste file)))
                ((eq? msg 'fs-full-path) (string-append (root-cut dir) (root-paste file)))
                (else (error "No such message permitted."))))))))

(define (get-type obj) (obj 'type))
(define (get-body obj) (obj 'body))
(define (get-url obj) (obj 'url))
(define (get-hostname obj) (obj 'host))
(define (get-filename obj) (obj 'file))
(define (get-webdir obj) (obj 'webdir))
(define (get-fsdir obj) (obj 'fsdir))
(define (get-web-full-path obj) (obj 'web-full-path))
(define (get-fs-full-path obj) (obj 'fs-full-path))

(define (construct-complete-url obj default-obj)
  (define (construct-url host dir file)
    (let ((tag "http://"))
      (string-append tag host dir (root-paste file))))
  (call/cc 
   (lambda (c)
     (let ((type (get-type obj))
           (host% (get-hostname default-obj))
           (dir% (get-webdir default-obj))
           (host (get-hostname obj))
           (dir (get-webdir obj)))
       (if (and (not (equal? "http" type))
                (not (equal? #f type)))
           (c #f)
           (construct-url
            (if (eq? #f host)
                host%
                (if (equal? host% host)
                    host
                    (c #f)))
            (if (eq? #f host)
                (if (equal? "." dir)
                    dir%
                    (if (from-root? dir) 
                        dir
                        (string-append dir% (root-paste dir))))
                dir)
            (get-filename obj)))))))

;;===================================================
;; utility to get web page.
;; return value is http-get multiple values as list.
;;
(define (get-web-page-from url)
  (let ((url-obj (break-down-url url)))
    (let ((host (get-hostname url-obj))
          (fsdir (get-fsdir url-obj))
          (infile (get-web-full-path url-obj))
          (outfile (get-fs-full-path url-obj)))
      (begin
        (make-directory* fsdir)
        (format/flush #t "steal ~s from ~s now ... " outfile host)
        (call-with-output-file outfile
          (lambda (out)
            (let ((response (call-with-values
                                (lambda ()
                                  (http-get host infile
                                            :sink out :flusher (lambda _ #t)))
                              list)))
              (format/flush #t "done\n")
              response)))))))

(define (content-type-of response)
  (if (null? response)
      "unknown-content-type"
      (cadr (assoc "content-type" (cadr response)))))

;;===================================================
;; special utility for http text.
;;
(define (refs-from-string str)
  (call/cc 
   (lambda (c)
     (let loop ((buff str)
                (refs '()))
       (if (string-incomplete? buff)
           (c '())
           (let ((match (#/(href=|src=)"(.*?)"/ buff)))
             (if match
                 (let ((url (match 2)))
                   (loop (match 'after) (cons url refs)))
                 (reverse refs))))))))

(define (collect-refs-from-file file)
  (call-with-input-file file
    (lambda (in)
      (let loop ((buff (read-line in #t))
                 (refs '()))
        (if (eof-object? buff)
            (reverse refs)
            (loop (read-line in #t) (append (refs-from-string buff) refs)))))))

(define (trans-refs refs default-url)
  (let ((*url* (break-down-url default-url)))
    (map (lambda (ref)
           (let ((url (break-down-url ref)))
             (construct-complete-url url *url*)))
         refs)))

(define (remove-false-url refs)
  (let loop ((refs refs)
             (result '()))
    (if (null? refs)
        (reverse result)
        (let ((item (car refs))
              (rest (cdr refs)))
          (if item
              (loop rest (cons item result))
              (loop rest result))))))

(define (sort-refs refs)
  (sort refs string<))

(define (unique-refs refs)
  (let loop ((refs refs)
             (result '())
             (prev #f))
    (if (null? refs)
        (reverse result)
        (let ((item (car refs))
              (rest (cdr refs)))
          (if (equal? prev item)
              (loop rest result item)
              (loop rest (cons item result) item))))))

;;===================================================
;; aliases for improvement of readability
;;
(define (illegal-url? url)
  (member #f (map (lambda (msg) (msg url))
                  (list get-hostname get-webdir get-filename))))

(define (type-is-html? response)
  (equal? "text/html"
          (content-type-of response)))
(define (child-of file parent-url)
  (unique-refs 
   (sort-refs 
    (remove-false-url 
     (trans-refs 
      (collect-refs-from-file file) parent-url)))))

;;===================================================
;; main logic
;;
(define (websteal url wait)
  (define %top-url% (break-down-url url))
  (if (illegal-url? %top-url%)
      (error "illegal url format found.")
      (if (file-exists? (get-fs-full-path %top-url%))
          'done
          (begin
            (sys-sleep wait)
            (if (type-is-html? (get-web-page-from url))
                (let ((url (get-url %top-url%))
                      (file (get-fs-full-path %top-url%)))
                  (let ((child-list (child-of file url)))
                    (for-each (lambda (url)
                                (websteal url *gentle-wait*))
                              child-list)))
                'not-html)))))

(define (main args)
  (websteal (cadr args) *no-wait*))


Last modified : 2007/02/21 04:37:59 UTC