(*
** Course: Concepts of Programming Languages (BU CAS CS 320)
** Semester: Summer I, 2009
** Instructor: Hongwei Xi (hwxi AT cs DOT bu DOT edu)
*)

//
// Author: Hongwei Xi (hwxi AT cs DOT bu DOT edu)
// Time: Wednesday, June 24, 2009
//

(* ****** ****** *)

//
// Listing words according to the number of times they are
// used in a given text. Those that are most frequently used
// are listed first. 
//

(* ****** ****** *)

// How to compile:
//   atscc code-2009-06-24.dats funbinheap.dats

(* ****** ****** *)

#include "BUCASCS320.hats"

(* ****** ****** *)

staload "libc/SATS/stdlib.sats"

(* ****** ****** *)

typedef cmp (elt: t@ype) = (elt, elt) -<fun> int
typedef natLt (n: int) = [i:nat | i < n] int (i)

extern fun{elt:t@ype} bsearch_cloref {n:nat}
  (f: natLt n -<cloref> elt, n: size_t n, v: elt, cmp: cmp elt)
  :<> [i:int | ~1 <= i; i < n] int i

implement{elt} bsearch_cloref {n}
  (f, n, v, cmp) = loop (0, n-1) where {
  typedef res_t = [i:int | ~1 <= i; i < n] int i
  fun loop
    {i,j:int | 0 <= i; i <= j+1; j+1 <= n} .<j-i+1>.
    (lb: int i, ub: int j):<cloref> res_t =
    if lb <= ub then let
      val m = lb + (ub - lb) / 2; val sgn = cmp (v, f (m))
    in
      if sgn >= 0 then loop (m+1, ub) else loop (lb, m-1) 
    end else ub
  // end of [loop]
  val n = int1_of_size1 (n)
} // end of [bsearch_cloref]

(* ****** ****** *)

extern fun wordbeg_get {n:nat}
  (wordarr: array (string, n), n: size_t n, word: string): int
// end of [wordbeg_get]

implement wordbeg_get (wordarr, n, word) = let
  fn cmp (s1: string, s2: string):<> int = let
    val sgn = compare (s1, s2) in if sgn > 0 then 1 else ~1
  end // end of [cmp]
in
  bsearch_cloref (lam (i) => $effmask_ref (wordarr[i]), n, word, cmp)
end // end of [wordbeg_get]

extern fun wordend_get {n:nat}
  (wordarr: array (string, n), n: size_t n, word: string): int
// end of [wordend_get]

implement wordend_get (wordarr, n, word) = let
  fn cmp (s1: string, s2: string):<> int = let
    val s1 = string1_of_string s1
    val s2 = string1_of_string s2
    fun loop {n1,n2:nat}
      {i:nat | i <= n1; i <= n2} .<n1-i>.
      (s1: string n1, s2: string n2, i: size_t i):<> Sgn =
      if string_is_atend (s1, i) then
        if string_is_atend (s2, i) then 1
        else let
          val c2 = s2[i]
        in
          if char_isalpha (c2) then ~1 else 1 
        end (* end of [if] *)
      else
        if string_is_atend (s2, i) then 1
        else let
          val sgn = compare (s1[i], s2[i])
        in
          if sgn <> 0 then sgn else loop (s1, s2, i+1)
        end
    // end of [loop]      
  in 
    loop (s1, s2, 0)
  end // end of [cmp]
in
  bsearch_cloref (lam (i) => $effmask_ref (wordarr[i]), n, word, cmp)
end // end of [wordend_get]

extern fun wordcnt_get {n:nat}
  (wordarr: array (string, n), n: size_t n, word: string): int
// end of [wordcnt_get]

implement wordcnt_get (wordarr, n, word) = let
  val n0 = wordbeg_get (wordarr, n, word) and n1 = wordend_get (wordarr, n, word)
in
  n1 - n0
end // end of [wordcnt_get]

(* ****** ****** *)

extern fun theText_initialize (
  buf: &string? >> string n
, nbuf: &size_t? >> size_t n
, wordarr: &array? >> array (string, n1)
, nwordarr: &size_t? >> size_t n1
) : #[n:nat;n1:nat] void = "theText_initialize"
// end of [theText_initialize]

(* ****** ****** *)

fn fprint_first_word
  (out: FILEref, s: string): void = loop (0) where {
  val [n:int] s = string1_of_string (s)
  fun loop {i:nat | i <= n} (i: size_t i):<cloref1> void =
    if string_isnot_atend (s, i) then let
      val c = s[i]
    in
      if char_isalpha (c) then (fprint_char (out, s[i]); loop (i+1))
    end // end of [if]
  // end of [loop]  
} // end of [fprint_first_word]

(* ****** ****** *)

fn first_word_get (s: string): string = let
  val [n:int] s = string1_of_string (s)
  fun loop {i:nat | i <= n} (i: size_t i):<cloref1> sizeLte n =
    if string_is_atend (s, i) then i else
      (if char_isalpha (s[i]) then loop (i+1) else i)
  // end of [loop]
  val sbp = string_make_substring (s, 0, loop 0)
in
  string1_of_strbuf (sbp)
end // end of [first_word_get]

(* ****** ****** *)

local

staload "funbinheap.dats"

typedef elt = @(
  int(*count*), string(*word*)
) // end of [elt]  

val cmp = lam (x1: elt, x2: elt): Sgn =<cloref> compare (x2.0, x1.0)

fn wordcounthp_build {n:nat}
  (wordarr: array (string, n), n: size_t n)
  : fbinhp (elt) = loop (res, 0, ntot) where {
  typedef res_t = fbinhp (elt) 
  val res = funbinheap_empty<> {elt} (); var ntot: int = 0
  fun loop {i:nat} (res: res_t, i: size_t i, ntot: &int):<cloref1> res_t =
    if i < n then let
      val wrd = first_word_get (wordarr[i])
      val cnt = wordcnt_get (wordarr, n, wrd)
      val res = funbinheap_insert<elt> (res, @(cnt, wrd), cmp)
      val cnt = int1_of_int (cnt)
      val () = assert (cnt >= 1)
(*
      val () = begin
        prerrf ("%i:\t%s(%i)\n", @(ntot, wrd, cnt))
      end // end of [val]
*)
      val i = i + cnt
      val () = ntot := ntot + 1
    in
      loop (res, i, ntot)
    end else let
(*
      val () = begin
        prerr "wordcounthp_build: ntot = "; prerr ntot; prerr_newline ()
      end // end of [val]  
*)
    in
      res // loop exits
    end (* end of [if] *)
  // end of [loop]
} // end [wordcountheap_build]

in // in of [local]

fn fprint_used_words {n:nat} (
   out: FILEref, wordarr: array (string, n), n: size_t n, nword: int
  ) : void = let
  val hp = wordcounthp_build (wordarr, n)
  val nhp = funbinheap_size (hp)
  val () = fprintf (out, "The total number of words used is [%i].\n", @(nhp))
  val () = loop (hp, nhp, 0) where {
    fun loop {nhp,i:nat}
      (res: fbinhp (elt, nhp), nhp: int nhp, i: int i):<cloref1> void =
      if i < nword then begin
        if nhp > 0 then let
          var x: elt // uninitialized
          val res = funbinheap_delmin<elt> (res, x, cmp)
          val () = fprintf (out, "%s (%i)\n", @(x.1, x.0))
        in
          loop (res, nhp-1, i+1)
        end else begin
          // the heap is empty // loop exits
        end // end of [if]
      end else begin
        // loop exits
      end // end of [if]  
  } (* end of [val] *)
in
  // empty
end // end of [fprint_used_words]

end // end of [local]

(* ****** ****** *)

dynload "funbinheap.dats"

(* ****** ****** *)

implement main () = () where {
  var theTextBuf: string // uninitialized
  var nbuf: size_t // uninitialized
  var theTextWordArr: array (string, 0) // uninitialized
  var nwordarr: size_t // uninitialized
  val () = theText_initialize (theTextBuf, nbuf, theTextWordArr, nwordarr)
  val theTextWordArr = theTextWordArr
  val () = begin
    print "theTextBuf_sz = "; print nbuf; print_newline ();
    print "theTextWordArr_sz = "; print nwordarr; print_newline ();
  end // end of [val]
(*
  #define N 1000
  val () = assert (nwordarr >= N)
  val () = loop (size1_of_int1 0) where {
    fun loop (i: sizeLte N):<cloref1> void = if i < N then let
      val () = fprint_first_word (stdout_ref, array_get_elt_at (theTextWordArr, i))
      val () = print_newline ()
    in
      loop (i+1)
    end // end of [loop]
  } (* end [val] *)
*)
  val () = () where {
    val (vbox pf_arr | p_arr) = array_get_view_ptr {string} (theTextWordArr)
    val () = qsort {string} (!p_arr, nwordarr, sizeof<string>, lam (s1, s2) => compare (s1, s2))
  } // end of [val]
  val () = (print "sorting is finished.\n"; print_newline ()) 
(*
  #define N 1000
  val () = assert (nwordarr >= N)
  val () = loop (size1_of_int1 0) where {
    fun loop (i: sizeLte N):<cloref1> void = if i < N then let
      val () = fprint_first_word (stdout_ref, array_get_elt_at (theTextWordArr, i))
      val () = print_newline ()
    in
      loop (i+1)
    end // end of [loop]
  } (* end [val] *)
*)
(*
  val ahab_cnt = wordcnt_get (theTextWordArr, nwordarr, "ahab")
  val () = printf ("ahab_cnt = %i\n", @(ahab_cnt))
  val ishmael_cnt = wordcnt_get (theTextWordArr, nwordarr, "ishmael")
  val () = printf ("ishmael_cnt = %i\n", @(ishmael_cnt))
*)
  // printing the top 500 words (with the numbers of times they are used)
  val () = fprint_used_words (stdout_ref, theTextWordArr, nwordarr, 500)
} // end of [main]

(* ****** ****** *)

%{$

#define CBUFSZ (16 * 1024 * 1024)
char theTextBuf[CBUFSZ] ;

#define WBUFSZ (1024 * 1024)
char *theTextWordArr[WBUFSZ] ;

ats_void_type
theText_initialize (
  ats_ref_type p_buf
, ats_ref_type p_nbuf
, ats_ref_type p_wordarr
, ats_ref_type p_nwordarr
) {
  int c ;
  int iw ;
  size_t i ;
  char *s ;
  size_t i1 ;
  char **s1 ;
  iw = 0 ;
  s = theTextBuf ;
  i = 0 ;
  s1 = theTextWordArr ;
  i1 = 0 ;
  while (1) {
    c = fgetc(stdin) ;
    if (c == EOF) break ;
    *s = tolower(c) ;
//
    if (isalpha(c)) {
      if (iw == 0) { iw = 1 ; *s1++ = s ; i1++ ; }
    } else {
      iw = 0 ;
    } /* end of [if] */
//
    s++ ; i++ ;
  } /* end of [while] */

  *s = '\000' ;

  *((char**)p_buf) = theTextBuf ;
  *((size_t*)p_nbuf) = i ;
  *((char***)p_wordarr) = theTextWordArr ;
  *((size_t*)p_nwordarr) = i1 ;

  if (i >= CBUFSZ) {
    fprintf (stderr, "exit: too many chars for processing\n") ; exit(1) ;
  } /* end of [if] */
  if (i1 >= WBUFSZ) {
    fprintf (stderr, "exit: too many words for processing\n") ; exit(1) ;
  } /* end of [if] */
  return ;
} /* end of [theTextBuf_initialize] */

%}

(* end of [code-2009-06-24.dats] *)