```(*
** Course: Concepts of Programming Languages (BU CAS CS 320)
** Semester: Summer I, 2009
** Instructor: Hongwei Xi (hwxi AT cs DOT bu DOT edu)
*)

//
// Author: Hongwei Xi (hwxi AT cs DOT bu DOT edu)
// Time: Wednesday, June 24, 2009
//

(* ****** ****** *)

//
// Listing words according to the number of times they are
// used in a given text. Those that are most frequently used
// are listed first.
//

(* ****** ****** *)

// How to compile:
//   atscc code-2009-06-24.dats funbinheap.dats

(* ****** ****** *)

#include "BUCASCS320.hats"

(* ****** ****** *)

(* ****** ****** *)

typedef cmp (elt: t@ype) = (elt, elt) -<fun> int
typedef natLt (n: int) = [i:nat | i < n] int (i)

extern fun{elt:t@ype} bsearch_cloref {n:nat}
(f: natLt n -<cloref> elt, n: size_t n, v: elt, cmp: cmp elt)
:<> [i:int | ~1 <= i; i < n] int i

implement{elt} bsearch_cloref {n}
(f, n, v, cmp) = loop (0, n-1) where {
typedef res_t = [i:int | ~1 <= i; i < n] int i
fun loop
{i,j:int | 0 <= i; i <= j+1; j+1 <= n} .<j-i+1>.
(lb: int i, ub: int j):<cloref> res_t =
if lb <= ub then let
val m = lb + (ub - lb) / 2; val sgn = cmp (v, f (m))
in
if sgn >= 0 then loop (m+1, ub) else loop (lb, m-1)
end else ub
// end of [loop]
val n = int1_of_size1 (n)
} // end of [bsearch_cloref]

(* ****** ****** *)

extern fun wordbeg_get {n:nat}
(wordarr: array (string, n), n: size_t n, word: string): int
// end of [wordbeg_get]

implement wordbeg_get (wordarr, n, word) = let
fn cmp (s1: string, s2: string):<> int = let
val sgn = compare (s1, s2) in if sgn > 0 then 1 else ~1
end // end of [cmp]
in
bsearch_cloref (lam (i) => \$effmask_ref (wordarr[i]), n, word, cmp)
end // end of [wordbeg_get]

extern fun wordend_get {n:nat}
(wordarr: array (string, n), n: size_t n, word: string): int
// end of [wordend_get]

implement wordend_get (wordarr, n, word) = let
fn cmp (s1: string, s2: string):<> int = let
val s1 = string1_of_string s1
val s2 = string1_of_string s2
fun loop {n1,n2:nat}
{i:nat | i <= n1; i <= n2} .<n1-i>.
(s1: string n1, s2: string n2, i: size_t i):<> Sgn =
if string_is_atend (s1, i) then
if string_is_atend (s2, i) then 1
else let
val c2 = s2[i]
in
if char_isalpha (c2) then ~1 else 1
end (* end of [if] *)
else
if string_is_atend (s2, i) then 1
else let
val sgn = compare (s1[i], s2[i])
in
if sgn <> 0 then sgn else loop (s1, s2, i+1)
end
// end of [loop]
in
loop (s1, s2, 0)
end // end of [cmp]
in
bsearch_cloref (lam (i) => \$effmask_ref (wordarr[i]), n, word, cmp)
end // end of [wordend_get]

extern fun wordcnt_get {n:nat}
(wordarr: array (string, n), n: size_t n, word: string): int
// end of [wordcnt_get]

implement wordcnt_get (wordarr, n, word) = let
val n0 = wordbeg_get (wordarr, n, word) and n1 = wordend_get (wordarr, n, word)
in
n1 - n0
end // end of [wordcnt_get]

(* ****** ****** *)

extern fun theText_initialize (
buf: &string? >> string n
, nbuf: &size_t? >> size_t n
, wordarr: &array? >> array (string, n1)
, nwordarr: &size_t? >> size_t n1
) : #[n:nat;n1:nat] void = "theText_initialize"
// end of [theText_initialize]

(* ****** ****** *)

fn fprint_first_word
(out: FILEref, s: string): void = loop (0) where {
val [n:int] s = string1_of_string (s)
fun loop {i:nat | i <= n} (i: size_t i):<cloref1> void =
if string_isnot_atend (s, i) then let
val c = s[i]
in
if char_isalpha (c) then (fprint_char (out, s[i]); loop (i+1))
end // end of [if]
// end of [loop]
} // end of [fprint_first_word]

(* ****** ****** *)

fn first_word_get (s: string): string = let
val [n:int] s = string1_of_string (s)
fun loop {i:nat | i <= n} (i: size_t i):<cloref1> sizeLte n =
if string_is_atend (s, i) then i else
(if char_isalpha (s[i]) then loop (i+1) else i)
// end of [loop]
val sbp = string_make_substring (s, 0, loop 0)
in
string1_of_strbuf (sbp)
end // end of [first_word_get]

(* ****** ****** *)

local

typedef elt = @(
int(*count*), string(*word*)
) // end of [elt]

val cmp = lam (x1: elt, x2: elt): Sgn =<cloref> compare (x2.0, x1.0)

fn wordcounthp_build {n:nat}
(wordarr: array (string, n), n: size_t n)
: fbinhp (elt) = loop (res, 0, ntot) where {
typedef res_t = fbinhp (elt)
val res = funbinheap_empty<> {elt} (); var ntot: int = 0
fun loop {i:nat} (res: res_t, i: size_t i, ntot: &int):<cloref1> res_t =
if i < n then let
val wrd = first_word_get (wordarr[i])
val cnt = wordcnt_get (wordarr, n, wrd)
val res = funbinheap_insert<elt> (res, @(cnt, wrd), cmp)
val cnt = int1_of_int (cnt)
val () = assert (cnt >= 1)
(*
val () = begin
prerrf ("%i:\t%s(%i)\n", @(ntot, wrd, cnt))
end // end of [val]
*)
val i = i + cnt
val () = ntot := ntot + 1
in
loop (res, i, ntot)
end else let
(*
val () = begin
prerr "wordcounthp_build: ntot = "; prerr ntot; prerr_newline ()
end // end of [val]
*)
in
res // loop exits
end (* end of [if] *)
// end of [loop]
} // end [wordcountheap_build]

in // in of [local]

fn fprint_used_words {n:nat} (
out: FILEref, wordarr: array (string, n), n: size_t n, nword: int
) : void = let
val hp = wordcounthp_build (wordarr, n)
val nhp = funbinheap_size (hp)
val () = fprintf (out, "The total number of words used is [%i].\n", @(nhp))
val () = loop (hp, nhp, 0) where {
fun loop {nhp,i:nat}
(res: fbinhp (elt, nhp), nhp: int nhp, i: int i):<cloref1> void =
if i < nword then begin
if nhp > 0 then let
var x: elt // uninitialized
val res = funbinheap_delmin<elt> (res, x, cmp)
val () = fprintf (out, "%s (%i)\n", @(x.1, x.0))
in
loop (res, nhp-1, i+1)
end else begin
// the heap is empty // loop exits
end // end of [if]
end else begin
// loop exits
end // end of [if]
} (* end of [val] *)
in
// empty
end // end of [fprint_used_words]

end // end of [local]

(* ****** ****** *)

(* ****** ****** *)

implement main () = () where {
var theTextBuf: string // uninitialized
var nbuf: size_t // uninitialized
var theTextWordArr: array (string, 0) // uninitialized
var nwordarr: size_t // uninitialized
val () = theText_initialize (theTextBuf, nbuf, theTextWordArr, nwordarr)
val theTextWordArr = theTextWordArr
val () = begin
print "theTextBuf_sz = "; print nbuf; print_newline ();
print "theTextWordArr_sz = "; print nwordarr; print_newline ();
end // end of [val]
(*
#define N 1000
val () = assert (nwordarr >= N)
val () = loop (size1_of_int1 0) where {
fun loop (i: sizeLte N):<cloref1> void = if i < N then let
val () = fprint_first_word (stdout_ref, array_get_elt_at (theTextWordArr, i))
val () = print_newline ()
in
loop (i+1)
end // end of [loop]
} (* end [val] *)
*)
val () = () where {
val (vbox pf_arr | p_arr) = array_get_view_ptr {string} (theTextWordArr)
val () = qsort {string} (!p_arr, nwordarr, sizeof<string>, lam (s1, s2) => compare (s1, s2))
} // end of [val]
val () = (print "sorting is finished.\n"; print_newline ())
(*
#define N 1000
val () = assert (nwordarr >= N)
val () = loop (size1_of_int1 0) where {
fun loop (i: sizeLte N):<cloref1> void = if i < N then let
val () = fprint_first_word (stdout_ref, array_get_elt_at (theTextWordArr, i))
val () = print_newline ()
in
loop (i+1)
end // end of [loop]
} (* end [val] *)
*)
(*
val ahab_cnt = wordcnt_get (theTextWordArr, nwordarr, "ahab")
val () = printf ("ahab_cnt = %i\n", @(ahab_cnt))
val ishmael_cnt = wordcnt_get (theTextWordArr, nwordarr, "ishmael")
val () = printf ("ishmael_cnt = %i\n", @(ishmael_cnt))
*)
// printing the top 500 words (with the numbers of times they are used)
val () = fprint_used_words (stdout_ref, theTextWordArr, nwordarr, 500)
} // end of [main]

(* ****** ****** *)

%{\$

#define CBUFSZ (16 * 1024 * 1024)
char theTextBuf[CBUFSZ] ;

#define WBUFSZ (1024 * 1024)
char *theTextWordArr[WBUFSZ] ;

ats_void_type
theText_initialize (
ats_ref_type p_buf
, ats_ref_type p_nbuf
, ats_ref_type p_wordarr
, ats_ref_type p_nwordarr
) {
int c ;
int iw ;
size_t i ;
char *s ;
size_t i1 ;
char **s1 ;
iw = 0 ;
s = theTextBuf ;
i = 0 ;
s1 = theTextWordArr ;
i1 = 0 ;
while (1) {
c = fgetc(stdin) ;
if (c == EOF) break ;
*s = tolower(c) ;
//
if (isalpha(c)) {
if (iw == 0) { iw = 1 ; *s1++ = s ; i1++ ; }
} else {
iw = 0 ;
} /* end of [if] */
//
s++ ; i++ ;
} /* end of [while] */

*s = '\000' ;

*((char**)p_buf) = theTextBuf ;
*((size_t*)p_nbuf) = i ;
*((char***)p_wordarr) = theTextWordArr ;
*((size_t*)p_nwordarr) = i1 ;

if (i >= CBUFSZ) {
fprintf (stderr, "exit: too many chars for processing\n") ; exit(1) ;
} /* end of [if] */
if (i1 >= WBUFSZ) {
fprintf (stderr, "exit: too many words for processing\n") ; exit(1) ;
} /* end of [if] */
return ;
} /* end of [theTextBuf_initialize] */

%}

(* end of [code-2009-06-24.dats] *)
```