{ (* ************************************************************** *) (* mbox-cleaner *) (* deletes superflous mails of mbox, which has same body. *) (* only ONE instance of such mails will survive *) (* -------------------------------------------------------------- *) (* Copyright: Oliver Bandel, Germany *) (* Program-Version: Version 1.0.0 (Sun Apr 3 18:26:19 CEST 2005) *) (* ************************************************************** *) (* No warranty on this program - use on your own risk. *) (* Redistribution under GPL, Vers. 2 is poossible. *) (* ************************************************************** *) (* Mailheader and Mailbody are splitted by one empty line. That empty line will be added to the header. So, if you want to reconstruct the mbox-file correctly, you only have to print the header, then the body. And all is going well! *) exception Eof exception Corrupted_mboxfile let lines = ref 1 let rest_of_last_scan = ref "" (* if we had read the begin of the next Mail, we save it her *) let header = ref "" let body = ref "" (* New stuff: using Buffer-module instead of string-append-operator! => speedup by factor 10! *) (* ------------------------------------------------------------------------------------------ *) let headbuf = Buffer.create 100_000 let bodybuf = Buffer.create 1_000_000 let append_header str = Buffer.add_string headbuf str let append_body str = Buffer.add_string bodybuf str let get_header () = Buffer.contents headbuf let get_body () = Buffer.contents bodybuf let clear_header () = Buffer.reset headbuf let clear_body () = Buffer.reset bodybuf } rule first_mailbegin = parse | "From " { append_header (Lexing.lexeme lexbuf); header_scan lexbuf } | _ { raise Corrupted_mboxfile} and header_scan = parse | [^ '\n' ]+ { append_header (Lexing.lexeme lexbuf); header_scan lexbuf} | "\n\n" { lines := !lines + 2; append_header "\n\n"; body_scan lexbuf } | "\n" { lines := !lines + 1; append_header "\n" ; header_scan lexbuf} | eof { raise Corrupted_mboxfile } (* not allowed to end the file in the header! *) and body_scan = parse | [^'\n']+ { append_body (Lexing.lexeme lexbuf); body_scan lexbuf } | "\nFrom " { lines := !lines + 1; append_body "\n"; rest_of_last_scan := "From " } (* !!! "\n" ist vom letzten BODY !!! *) | '\n' { lines := !lines + 1; append_body (Lexing.lexeme lexbuf); body_scan lexbuf } | eof { (*print_endline "_***_MAILBOXFILE_ZUENDE_***_";*) raise Eof } { let get_mail () = let h = get_header() and b = get_body () in (* Header *) clear_header(); append_header !rest_of_last_scan; rest_of_last_scan := ""; (* Body *) clear_body(); (h,b) (* -------------------------------------------- *) (* this function creates an md5sum in hex-ascii *) (* -------------------------------------------- *) let hex_bodydigest_of_mail m = match m with (_,b) -> Digest.to_hex (Digest.string b) (* =============================================================== *) (* this function reads in the mbox-data from the channel "ch", *) (* scans the data and split's it up to a list of header-body-pairs *) (* it only collects mails into the result-list, if they are unique *) (* in body-contents! *) (* =============================================================== *) let collect_uniq ch = let list_of_mails = ref [] in (* for the result *) let digest_htbl = Hashtbl.create 10_000 in (* the md5-hashtbl *) let lexed_mail = ref ("","") in (* here goes the current mail *) (* -------------------------------------------------------------------------------------- *) (* checks if mail's body is unique and if so, add mail to the list and update the hashtbl *) (* -------------------------------------------------------------------------------------- *) let add_mail_if_uniq () = let md5 = ref "" in (* here goes the current md5 of mail's body *) lexed_mail := get_mail(); (* read mail *) md5 := hex_bodydigest_of_mail !lexed_mail; (* md5 of mailbody *) if (Hashtbl.mem digest_htbl !md5) (* conditional: if mail is known *) then (* then *) () (* ignore *) else (* else *) begin Hashtbl.add digest_htbl !md5 ' '; (* make it known *) list_of_mails := !lexed_mail :: !list_of_mails (* add it to the list *) end in let () = let lexbuf = Lexing.from_channel ch in try (* FIRST Mail -> only accept "^From " direct at begin of file *) (* ---------------------------------------------------------- *) first_mailbegin lexbuf; (* entrypoint lexer *) lexed_mail := get_mail(); (* read first mail *) list_of_mails := !lexed_mail :: !list_of_mails; (* put mail on list *) Hashtbl.add digest_htbl (hex_bodydigest_of_mail !lexed_mail) ' '; (* update hashtbl *) (* for all other mails: handle them equally *) (* ---------------------------------------- *) while true do header_scan lexbuf; (* entrypoint lexer *) add_mail_if_uniq () (* put mail on list, if unknown/uniq *) done (* when eof was reached, there always is data of one mail waiting for you... *) (* ------------------------------------------------------------------------- *) with Eof -> add_mail_if_uniq () (* put mail on list, if unknown/uniq *) in List.rev !list_of_mails (* reverse the list, because it was built reverse *) }