#!  /usr/local/bin/gosh

;; 語の重みに tf-idf を用いた用いてテキストのコサイン類似度を
;; 調べる.
;
;; 空白か改行で語を区切れる程度の簡単な文章を2つ以上用意して
;; $ gosh ./similarity.scm input1 input2 ...
;; のように実験する

; 例えば Wikipediaからいくつかの記事を wikiディレクトリに保存して
; 
; $ gosh ./similarity.scm wiki/*
; wiki/Aiura wiki/Linux 0.0054991185818617765
; wiki/Aiura wiki/Macos 0.002611462566896136
; wiki/Aiura wiki/Trisquel 0.003894850295124982
; wiki/Aiura wiki/Windows 0.0035955775448793274
; wiki/Aiura wiki/Yuyushiki 0.4289658365289811
; wiki/Linux wiki/Macos 0.135492604558405
; wiki/Linux wiki/Trisquel 0.1167040839600922
; wiki/Linux wiki/Windows 0.11053283743818382
; wiki/Linux wiki/Yuyushiki 0.005589308305096626
; wiki/Macos wiki/Trisquel 0.0170007870784677
; wiki/Macos wiki/Windows 0.10521518337390891
; wiki/Macos wiki/Yuyushiki 0.0032617734817390353
; wiki/Trisquel wiki/Windows 0.01082203311131606
; wiki/Trisquel wiki/Yuyushiki 0.00462516926987522
; wiki/Windows wiki/Yuyushiki 0.00353391844504245

;; コサイン類似度であるから2つのテキストの類似度として0以上1以下の
;; 実数が出力されて,数字が大きいほど類似度が高いことを言う


(use srfi-13)

;; filename -> hash-table(word->tf)
(define (get-tokens f)
  (define words
    (call-with-input-file f (lambda (port)
    (string-tokenize (port->string port) #[\w]))))
  (define N (length words))
  (define A (make-hash-table 'equal?))

  ;; count words and div by N
  (let1 /N (/ N)
  (for-each (cut hash-table-update! A <> (cut + /N <>) 0) words))

  A)

(define (main args)

  (let1 files (cdr args)
  (define N (length files))
  (when (< N 2) (error 'need-more-2-files))

  ;; hash-table (filename -> hash-table(word->tf))
  (let1 ht
      (apply hash-table 'equal?
        (map (lambda (f) (cons f (get-tokens f))) files))

  ;; word -> idf ; with memo
  (define idf

    (let1 memo (make-hash-table 'equal?)
    (define (get-idf w) ; without memo
      (log (/ N
              (apply +
                (hash-table-map ht (lambda (_ A)
                  (if (hash-table-exists? A w) 1 0)))))))

    (lambda (w)
      (let1 x (hash-table-get memo w #f)
      (cond (x x)
            (else
              (let1 ret (get-idf w)
              ;(format #t "idf of ~a = ~a\n" w ret)
              (hash-table-put! memo w ret)
              ret)) )))
    ))

  ;; (filename, filename) -> [0, 1]
  (define (cosine-distance f1 f2)
    (let ((A1 (hash-table-get ht f1))
          (A2 (hash-table-get ht f2)))
    (let ((abs-A1 0) (abs-A2 0) (A1*A2 0))
    (hash-table-for-each A1 (lambda (w tf)
      (let1 idf. (idf w)
      (set! A1*A2 (+ A1*A2 (* tf (hash-table-get A2 w 0) idf. idf.)))
      (set! abs-A1 (+ abs-A1 (* tf tf idf. idf.))))))
    (hash-table-for-each A2 (lambda (w tf)
      (let1 idf. (idf w)
      (set! abs-A2 (+ abs-A2 (* tf tf idf. idf.))))))
    (/ A1*A2 (sqrt abs-A1) (sqrt abs-A2)))))

  ;; for all combination, figure out cosine-distance
  (let for ((ls files))
    (cond ((< (length ls) 2) 'done)
          (else
              (let1 f1 (car ls)
              (for-each (lambda (f2)
                (let1 dis (cosine-distance f1 f2)
                  (format #t "~a ~a ~a\n" f1 f2 dis)))
                (cdr ls)))
              (for (cdr ls)))))
  ))
  0)