要はちゃんと文書全体をパーズしてなくて、 リンクとおぼしき部分にマッチするものを 決め打ちで引っ掛けてるので。
web サイトからコンテンツをごっそり引っ張って来る。
ルートになるファイルを指定して実行するとリンクをたどって
次々に持って来るようにしている。
もちろん別のホストへのリンクは引っ張らない。(想像するだけで恐いから...)
以下のように SICP とか t-y-scheme みたいなサイトを自分の端末上に
ごっそり欲しかったのが書いてみた動機。
HTML 関係あんまり詳しく勉強してないので、 try&error で作ってます。 これから勉強する予定cut-sea:2004/07/25 06:10:31 PDT
gosh> (assoc "content-type" (values-ref (http-get "mitpress.mit.edu" "/sicp/full-text/book/book.html") 1)) ("content-type" "text/html") gosh> (assoc "content-type" (values-ref (http-get "mitpress.mit.edu" "/sicp/full-text/book/book-Z-C.css") 1)) ("content-type" "text/css") gosh> (assoc "content-type" (values-ref (http-get "mitpress.mit.edu" "/sicp/full-text/book/cover.jpg") 1)) ("content-type" "image/jpeg")
(use htmlprag) (use sxml.sxpath) (define refs-from-shtml (node-or (sxpath '(// (or@ a area link) @ href *text*)) (sxpath '(// (or@ script input frame iframe img) @ src *text*)))) (define (collect-refs-from-file file) (call-with-input-file file (lambda (in) (refs-from-shtml (html->shtml in)))))
gosh> (load "./websteal.scm") #t gosh> (websteal "http://mitpress.mit.edu/sicp/full-text/book/book.html") getting file "sicp/full-text/book/book.html" from "mitpress.mit.edu" now ... done getting file "sicp/full-text/book/book-Z-C.css" from "mitpress.mit.edu" now ... done getting file "sicp/full-text/book/book-Z-H-1.html" from "mitpress.mit.edu" now ... done (snip) getting file "sicp/full-text/book/ch1-Z-G-4.gif" from "mitpress.mit.edu" now ... done getting file "sicp/full-text/book/ch1-Z-G-5.gif" from "mitpress.mit.edu" now ... done getting file "sicp/full-text/book/ch1-Z-G-6.gif" from "mitpress.mit.edu" now ... done getting file "sicp/full-text/book/cover.jpg" from "mitpress.mit.edu" now ... done #<undef> gosh>
":";exec gosh -b $0 "$@" ;; ;; Usage: ./websteal 'http://mitpress.mit.edu/sicp/full-text/book/book.html' ;; ;; $Id$ ;; ;;=================================================== ;; use library ;; (use rfc.http) (use rfc.uri) (use file.util) (use srfi-13) ;;=================================================== ;; global values ;; (define *no-wait* 0) (define *gentle-wait* 10) ;;=================================================== ;; miscellaneous functions ;; (define-syntax format/flush (syntax-rules () ((_ arg ...) (begin (format arg ...) (flush))))) ;;=================================================== ;; destruct/construct utility for URL and file name. ;; (define (root-cut path) (list->string (cdr (string->list path)))) (define (root-paste path) (string-append "/" path)) (define (from-root? path) (equal? #\/ (string-ref path 0))) (define (break-down-url url) (let ((url-head (values-ref (uri-scheme&specific url) 0)) (url-body (values-ref (uri-scheme&specific url) 1))) (let ((url-list (call-with-values (lambda _ (uri-decompose-hierarchical url-body)) list))) (let ((host (car url-list)) (dir (sys-dirname (cadr url-list))) (file (sys-basename (cadr url-list)))) (lambda (msg) (cond ((eq? msg 'url) url) ((eq? msg 'type) url-head) ((eq? msg 'body) url-body) ((eq? msg 'host) host) ((eq? msg 'file) file) ((eq? msg 'webdir) dir) ((eq? msg 'fsdir) (root-cut dir)) ((eq? msg 'web-full-path) (string-append dir (root-paste file))) ((eq? msg 'fs-full-path) (string-append (root-cut dir) (root-paste file))) (else (error "No such message permitted.")))))))) (define (get-type obj) (obj 'type)) (define (get-body obj) (obj 'body)) (define (get-url obj) (obj 'url)) (define (get-hostname obj) (obj 'host)) (define (get-filename obj) (obj 'file)) (define (get-webdir obj) (obj 'webdir)) (define (get-fsdir obj) (obj 'fsdir)) (define (get-web-full-path obj) (obj 'web-full-path)) (define (get-fs-full-path obj) (obj 'fs-full-path)) (define (construct-complete-url obj default-obj) (define (construct-url host dir file) (let ((tag "http://")) (string-append tag host dir (root-paste file)))) (call/cc (lambda (c) (let ((type (get-type obj)) (host% (get-hostname default-obj)) (dir% (get-webdir default-obj)) (host (get-hostname obj)) (dir (get-webdir obj))) (if (and (not (equal? "http" type)) (not (equal? #f type))) (c #f) (construct-url (if (eq? #f host) host% (if (equal? host% host) host (c #f))) (if (eq? #f host) (if (equal? "." dir) dir% (if (from-root? dir) dir (string-append dir% (root-paste dir)))) dir) (get-filename obj))))))) ;;=================================================== ;; utility to get web page. ;; return value is http-get multiple values as list. ;; (define (get-web-page-from url) (let ((url-obj (break-down-url url))) (let ((host (get-hostname url-obj)) (fsdir (get-fsdir url-obj)) (infile (get-web-full-path url-obj)) (outfile (get-fs-full-path url-obj))) (begin (make-directory* fsdir) (format/flush #t "steal ~s from ~s now ... " outfile host) (call-with-output-file outfile (lambda (out) (let ((response (call-with-values (lambda () (http-get host infile :sink out :flusher (lambda _ #t))) list))) (format/flush #t "done\n") response))))))) (define (content-type-of response) (if (null? response) "unknown-content-type" (cadr (assoc "content-type" (cadr response))))) ;;=================================================== ;; special utility for http text. ;; (define (refs-from-string str) (call/cc (lambda (c) (let loop ((buff str) (refs '())) (if (string-incomplete? buff) (c '()) (let ((match (#/(href=|src=)"(.*?)"/ buff))) (if match (let ((url (match 2))) (loop (match 'after) (cons url refs))) (reverse refs)))))))) (define (collect-refs-from-file file) (call-with-input-file file (lambda (in) (let loop ((buff (read-line in #t)) (refs '())) (if (eof-object? buff) (reverse refs) (loop (read-line in #t) (append (refs-from-string buff) refs))))))) (define (trans-refs refs default-url) (let ((*url* (break-down-url default-url))) (map (lambda (ref) (let ((url (break-down-url ref))) (construct-complete-url url *url*))) refs))) (define (remove-false-url refs) (let loop ((refs refs) (result '())) (if (null? refs) (reverse result) (let ((item (car refs)) (rest (cdr refs))) (if item (loop rest (cons item result)) (loop rest result)))))) (define (sort-refs refs) (sort refs string<)) (define (unique-refs refs) (let loop ((refs refs) (result '()) (prev #f)) (if (null? refs) (reverse result) (let ((item (car refs)) (rest (cdr refs))) (if (equal? prev item) (loop rest result item) (loop rest (cons item result) item)))))) ;;=================================================== ;; aliases for improvement of readability ;; (define (illegal-url? url) (member #f (map (lambda (msg) (msg url)) (list get-hostname get-webdir get-filename)))) (define (type-is-html? response) (equal? "text/html" (content-type-of response))) (define (child-of file parent-url) (unique-refs (sort-refs (remove-false-url (trans-refs (collect-refs-from-file file) parent-url))))) ;;=================================================== ;; main logic ;; (define (websteal url wait) (define %top-url% (break-down-url url)) (if (illegal-url? %top-url%) (error "illegal url format found.") (if (file-exists? (get-fs-full-path %top-url%)) 'done (begin (sys-sleep wait) (if (type-is-html? (get-web-page-from url)) (let ((url (get-url %top-url%)) (file (get-fs-full-path %top-url%))) (let ((child-list (child-of file url))) (for-each (lambda (url) (websteal url *gentle-wait*)) child-list))) 'not-html))))) (define (main args) (websteal (cadr args) *no-wait*))