{
(* ************************************************************** *)
(* mbox-cleaner                                                   *)
(* deletes superflous mails of mbox, which has same body.         *)
(* only ONE instance of such mails will survive                   *)
(* -------------------------------------------------------------- *)
(* Copyright: Oliver Bandel, Germany                              *)
(* Program-Version: Version 1.0.0 (Sun Apr  3 18:26:19 CEST 2005) *)
(* ************************************************************** *)
(* No warranty on this program - use on your own risk.            *)
(* Redistribution under GPL, Vers. 2 is poossible.                *)
(* ************************************************************** *)

(*
   Mailheader and Mailbody are splitted by one empty line.
   That empty line will be added to the header.
   So, if you want to reconstruct the mbox-file correctly, you
   only have to print the header, then the body.
   And all is going well!
*)

exception Eof
exception Corrupted_mboxfile

let lines = ref 1

let rest_of_last_scan   = ref "" (* if we had read the begin of the next Mail, we save it her *)
let header = ref ""
let body   = ref ""

(* New stuff: using Buffer-module instead of string-append-operator! => speedup by factor 10! *)
(* ------------------------------------------------------------------------------------------ *)
let headbuf = Buffer.create   100_000
let bodybuf = Buffer.create 1_000_000

let append_header str = Buffer.add_string headbuf str
let append_body   str = Buffer.add_string bodybuf str

let get_header () = Buffer.contents headbuf
let get_body   () = Buffer.contents bodybuf

let clear_header () = Buffer.reset headbuf
let clear_body   () = Buffer.reset bodybuf

}

rule first_mailbegin = parse
   | "From "   { append_header (Lexing.lexeme lexbuf); header_scan lexbuf }
   | _ { raise Corrupted_mboxfile}

and
  header_scan = parse
   | [^ '\n' ]+  { append_header (Lexing.lexeme lexbuf); header_scan lexbuf}
   | "\n\n"      { lines := !lines + 2; append_header "\n\n"; body_scan lexbuf }
   | "\n"        { lines := !lines + 1; append_header "\n" ; header_scan lexbuf}
   | eof         { raise Corrupted_mboxfile } (* not allowed to end the file in the header! *)

and
  body_scan = parse
   | [^'\n']+  { append_body (Lexing.lexeme lexbuf); body_scan lexbuf }
   | "\nFrom " { lines := !lines + 1; append_body "\n"; rest_of_last_scan := "From " } (* !!! "\n" ist vom letzten BODY !!! *)
   | '\n'      { lines := !lines + 1; append_body (Lexing.lexeme lexbuf); body_scan lexbuf }
   | eof       { (*print_endline "_***_MAILBOXFILE_ZUENDE_***_";*)  raise Eof }


{

let get_mail () = 
    let h = get_header() and
        b = get_body ()
        in 
          (* Header *)
            clear_header();
            append_header !rest_of_last_scan; 
            rest_of_last_scan := "";
          (* Body *)
            clear_body();
            (h,b)

(* -------------------------------------------- *)
(* this function creates an md5sum in hex-ascii *)
(* -------------------------------------------- *)
let hex_bodydigest_of_mail m = match m with
     (_,b) -> Digest.to_hex (Digest.string b)


(* =============================================================== *)
(* this function reads in the mbox-data from the channel "ch",     *)
(* scans the data and split's it up to a list of header-body-pairs *)
(* it only collects mails into the result-list, if they are unique *)
(* in body-contents!                                               *)
(* =============================================================== *)
let collect_uniq ch =
     let list_of_mails = ref [] in               (* for the result  *)
     let digest_htbl = Hashtbl.create 10_000 in  (* the md5-hashtbl *)
     let lexed_mail = ref ("","") in             (* here goes the current mail *)

     (* -------------------------------------------------------------------------------------- *)
     (* checks if mail's body is unique and if so, add mail to the list and update the hashtbl *)
     (* -------------------------------------------------------------------------------------- *)
     let add_mail_if_uniq () = 
             let md5 = ref "" in                         (* here goes the current md5 of mail's body *)
             lexed_mail := get_mail();                                (* read mail *)
             md5 := hex_bodydigest_of_mail !lexed_mail;                    (* md5 of mailbody *)

             if (Hashtbl.mem digest_htbl !md5)                   (* conditional: if mail is known *)
             then                                                 (* then        *)
               ()                                                 (*      ignore *)
             else                                                 (* else        *)
               begin
                 Hashtbl.add digest_htbl !md5 ' ';                (*      make it known      *)
                 list_of_mails := !lexed_mail :: !list_of_mails   (*      add it to the list *)
               end
     in

        let () = let lexbuf = Lexing.from_channel ch in
            try
                (* FIRST Mail -> only accept "^From " direct at begin of file *)
                (* ---------------------------------------------------------- *)
                    first_mailbegin lexbuf;                                       (* entrypoint lexer *)
                    lexed_mail := get_mail();                                     (* read first mail *)
                    list_of_mails :=  !lexed_mail :: !list_of_mails;               (* put mail on list *)
                    Hashtbl.add digest_htbl (hex_bodydigest_of_mail !lexed_mail) ' '; (* update hashtbl *)
    
                (* for all other mails: handle them equally *)
                (* ---------------------------------------- *)
                while true do
                  header_scan lexbuf;       (* entrypoint lexer *)
                  add_mail_if_uniq ()       (* put mail on list, if unknown/uniq *)
                done
    
                (* when eof was reached, there always is data of one mail waiting for you... *)
                (* ------------------------------------------------------------------------- *)
              with Eof -> add_mail_if_uniq () (* put mail on list, if unknown/uniq *)
        in
          List.rev !list_of_mails  (* reverse the list, because it was built reverse *)


}